diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..bc7bcf7
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,9 @@
+License for Non-commercial Scientific Research Purposes
+
+IDEA grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under IDEA’s copyright interests to reproduce, distribute, and create derivative works of the text, videos, codes solely for your non-commercial research purposes.
+
+Any other use, in particular any use for commercial, pornographic, military, or surveillance purposes, is prohibited.
+
+Text and visualization results are owned by International Digital Economy Academy (IDEA).
+
+You also need to obey the original license of the dependency models/data used in this service.
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..664d53f
--- /dev/null
+++ b/app.py
@@ -0,0 +1,661 @@
+import shutil
+import subprocess
+
+import torch
+import gradio as gr
+from fastapi import FastAPI
+import os
+from PIL import Image
+import tempfile
+from decord import VideoReader, cpu
+import uvicorn
+from transformers import TextStreamer
+
+import hashlib
+import sys
+import time
+import warnings
+from pathlib import Path
+from typing import Dict, List, Literal, Optional, Tuple
+from lit_gpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable
+
+import lightning as L
+import numpy as np
+import torch.nn as nn
+import torch.nn.functional as F
+
+from generate import generate as generate_
+from lit_llama import Tokenizer, LLaMA, LLaMAConfig
+from lit_llama.lora import lora
+from lit_llama.utils import EmptyInitOnDevice
+from lit_gpt.utils import lazy_load
+from scripts.video_dataset.prepare_video_dataset_video_llava import generate_prompt_mlp
+from options import option
+import imageio
+from tqdm import tqdm
+
+from models.multimodal_encoder.builder import build_image_tower, build_video_tower
+from models.multimodal_projector.builder import build_vision_projector
+
+
+title_markdown = ("""
+
MotionLLM: Understanding Human Behaviors from Human Motions and Videos
+
+
😎 Co-first author. Listing order is random. 🤗 Corresponding author.
+
+ 1 THU
+ 2 CUHK (SZ)
+ 3 IDEA Research
+ 4 HKUST
+
+
+
+
+
+
+""")
+
+block_css = """
+#buttons button {
+ min-width: min(120px,100%);
+}
+"""
+
+
+tos_markdown = ("""
+*We are now working to support the motion branch of the MotionLLM model.
+
+### Terms of use
+By using this service, users are required to agree to the following terms:
+The service is a research preview intended for non-commercial use only. It provides only limited safety measures and may generate offensive content.
+It is forbidden to use the service to generate content that is illegal, harmful, violent, racist, or sexual.
+The usage of this service is subject to the IDEA License.
+""")
+
+
+learn_more_markdown = ("""
+### License
+License for Non-commercial Scientific Research Purposes
+
+IDEA grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under IDEA’s copyright interests to reproduce, distribute, and create derivative works of the text, videos, codes solely for your non-commercial research purposes.
+
+Any other use, in particular any use for commercial, pornographic, military, or surveillance purposes, is prohibited.
+
+Text and visualization results are owned by International Digital Economy Academy (IDEA).
+
+You also need to obey the original license of the dependency models/data used in this service.
+""")
+
+
+
+class LlavaMetaModel:
+
+ def __init__(self, config, pretrained_checkpoint):
+ super(LlavaMetaModel, self).__init__()
+ # import pdb; pdb.set_trace()
+ if hasattr(config, "mm_image_tower") or hasattr(config, "image_tower"):
+ self.image_tower = build_image_tower(config, delay_load=True)
+ self.mm_projector = build_vision_projector(config)
+ if hasattr(config, "mm_video_tower") or hasattr(config, "video_tower"):
+ self.video_tower = build_video_tower(config, delay_load=True)
+ self.mm_projector = build_vision_projector(config)
+ self.load_video_tower_pretrained(pretrained_checkpoint)
+
+ def get_image_tower(self):
+ image_tower = getattr(self, 'image_tower', None)
+ if type(image_tower) is list:
+ image_tower = image_tower[0]
+ return image_tower
+
+ def get_video_tower(self):
+ video_tower = getattr(self, 'video_tower', None)
+
+ if type(video_tower) is list:
+ video_tower = video_tower[0]
+ return video_tower
+
+
+ def get_all_tower(self, keys):
+ tower = {key: getattr(self, f'get_{key}_tower') for key in keys}
+ return tower
+
+
+ def load_video_tower_pretrained(self, pretrained_checkpoint):
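+        # The checkpoint passed in here is expected to hold the mm_projector (vision-to-LLM projection) weights.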
+ self.mm_projector.load_state_dict(pretrained_checkpoint, strict=True)
+
+
+ def initialize_image_modules(self, model_args, fsdp=None):
+ image_tower = model_args.image_tower
+ mm_vision_select_layer = model_args.mm_vision_select_layer
+ mm_vision_select_feature = model_args.mm_vision_select_feature
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+ self.config.mm_image_tower = image_tower
+
+ image_tower = build_image_tower(model_args)
+
+ if fsdp is not None and len(fsdp) > 0:
+ self.image_tower = [image_tower]
+ else:
+ self.image_tower = image_tower
+
+ self.config.use_mm_proj = True
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+ self.config.mm_hidden_size = image_tower.hidden_size
+ self.config.mm_vision_select_layer = mm_vision_select_layer
+ self.config.mm_vision_select_feature = mm_vision_select_feature
+
+ self.mm_projector = build_vision_projector(self.config)
+
+ if pretrain_mm_mlp_adapter is not None:
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
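+            # get_w keeps only the entries whose key contains `keyword` and strips the "<keyword>." prefix from each key.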
+ def get_w(weights, keyword):
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+ def initialize_video_modules(self, model_args, fsdp=None):
+ video_tower = model_args.video_tower
+ mm_vision_select_layer = model_args.mm_vision_select_layer
+ mm_vision_select_feature = model_args.mm_vision_select_feature
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+ self.config.mm_video_tower = video_tower
+
+ video_tower = build_video_tower(model_args)
+
+ if fsdp is not None and len(fsdp) > 0:
+ self.video_tower = [video_tower]
+ else:
+ self.video_tower = video_tower
+
+ self.config.use_mm_proj = True
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+ self.config.mm_hidden_size = video_tower.hidden_size
+ self.config.mm_vision_select_layer = mm_vision_select_layer
+ self.config.mm_vision_select_feature = mm_vision_select_feature
+
+ self.mm_projector = build_vision_projector(self.config)
+
+ if pretrain_mm_mlp_adapter is not None:
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+ def get_w(weights, keyword):
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+ def encode_images(self, images):
+ image_features = self.get_image_tower()(images)
+ image_features = self.mm_projector(image_features)
+ return image_features
+
+ def encode_videos(self, videos):
+ # import pdb; pdb.set_trace()
+ # videos: torch.Size([1, 3, 8, 224, 224])
+ video_features = self.get_video_tower()(videos) # torch.Size([1, 2048, 1024])
+ video_features = self.mm_projector(video_features.float()) # torch.Size([1, 2048, 4096])
+ return video_features
+
+ def get_multimodal_embeddings(self, X_modalities):
+ Xs, keys= X_modalities
+
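+        # Dispatch to encode_images / encode_videos based on the first modality key (e.g. 'video' -> encode_videos).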
+ X_features = getattr(self, f'encode_{keys[0]}s')(Xs) # expand to get batchsize
+
+ return X_features
+
+
+class Projection(nn.Module):
+ def __init__(self, ):
+ super().__init__()
+ self.linear_proj = nn.Linear(512, 4096)
+ def forward(self, x):
+ return self.linear_proj(x)
+
+
+class ProjectionNN(nn.Module):
+ def __init__(self, ):
+ super().__init__()
+ self.proj = nn.Sequential(
+ nn.Linear(512, 4096),
+ nn.GELU(),
+ nn.Linear(4096, 4096)
+ )
+ def forward(self, x):
+ return self.proj(x)
+
+
+class Conversation():
+    def __init__(self, output=None, input_prompt=None, prompt=None):
+        self.messages = []
+        if output is not None:
+            # keep the same record layout as append_message: (output, input_prompt, prompt, show_images)
+            self.append_message(output, input_prompt, prompt, None)
+
+ def append_message(self, output, input_prompt, prompt, show_images):
+ # print(output)
+ # print(input_prompt)
+ # print(prompt)
+ # print(show_images)
+ self.messages.append((output, input_prompt, prompt, show_images))
+
+ def to_gradio_chatbot(self, show_images=None, output_text=None):
+ # return a list
+ if show_images is None:
+ show_images = self.messages[-1][3]
+ output_text = self.messages[-1][0]
+ return [
+ [show_images, output_text]
+ ]
+
+ def get_info(self):
+ return self.messages[-1][0], self.messages[-1][1]
+
+
+class ConversationBuffer():
+    def __init__(self, input_text):
+        self.buffer_ = []
+        self.buffer_.append(input_text)
+
+
+def init_conv():
+ conv = Conversation()
+ return conv
+
+
+def get_processor(X, config, device, pretrained_checkpoint_tower, model_path = 'LanguageBind/MotionLLM-7B'):
+ mm_backbone_mlp_model = LlavaMetaModel(config, pretrained_checkpoint_tower)
+
+ processor = {}
+ if 'Image' in X:
+ image_tower = mm_backbone_mlp_model.get_image_tower() # LanguageBindImageTower()
+ if not image_tower.is_loaded:
+ image_tower.load_model()
+ image_tower.to(device=device, dtype=torch.float16)
+ image_processor = image_tower.image_processor
+ processor['image'] = image_processor
+ if 'Video' in X:
+ video_tower = mm_backbone_mlp_model.get_video_tower()
+ if not video_tower.is_loaded:
+ video_tower.load_model()
+ video_tower.to(device=device, dtype=torch.float16)
+ video_processor = video_tower.video_processor
+ processor['video'] = video_processor
+
+ return mm_backbone_mlp_model, processor
+
+
+def motionllm(
+ args,
+ input_video_path: str,
+ text_en_in: str,
+ quantize: Optional[str] = None,
+ dtype: str = "float32",
+ max_new_tokens: int = 200,
+ top_k: int = 200,
+ temperature: float = 0.8,
+ accelerator: str = "auto",):
+
+ video_tensor = video_processor(input_video_path, return_tensors='pt')['pixel_values']
+
+ if type(video_tensor) is list:
+ tensor = [video.to('cuda', dtype=torch.float16) for video in video_tensor]
+ else:
+ tensor = video_tensor.to('cuda', dtype=torch.float16) # (1,3,8,224,224)
+
+ X_modalities = [tensor,['video']]
+ video_feature = mm_backbone_mlp_model.get_multimodal_embeddings(X_modalities)
+ prompt = text_en_in
+ input_prompt = prompt
+
+ sample = {"instruction": prompt, "input": input_video_path}
+
+ prefix = generate_prompt_mlp(sample)
+ pre = torch.cat((tokenizer.encode(prefix.split('INPUT_VIDEO: ')[0] + "\n", bos=True, eos=False, device=model.device).view(1, -1), tokenizer.encode("INPUT_VIDEO: ", bos=False, eos=False, device=model.device).view(1, -1)), dim=1)
+
+ prompt = (pre, ". ASSISTANT: ")
+ encoded = (prompt[0], video_feature[0], tokenizer.encode(prompt[1], bos=False, eos=False, device=model.device).view(1, -1))
+
+ t0 = time.perf_counter()
+
+ output_seq = generate_(
+ model,
+ idx=encoded,
+ max_seq_length=4096,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_k=top_k,
+ eos_id=tokenizer.eos_id,
+ tokenizer = tokenizer,
+ )
+ outputfull = tokenizer.decode(output_seq)
+ output = outputfull.split("ASSISTANT:")[-1].strip()
+ print("================================")
+ print(output)
+ print("================================")
+
+ return output, input_prompt, prompt
+
+
+def save_image_to_local(image):
+ filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
+ image = Image.open(image)
+ image.save(filename)
+ # print(filename)
+ return filename
+
+
+def save_video_to_local(video_path):
+ filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
+ shutil.copyfile(video_path, filename)
+ return filename
+
+
+def generate(image1, video, textbox_in, first_run, state, images_tensor):
+ flag = 1
+
+ image1 = image1 if image1 else "none"
+ video = video if video else "none"
+
+ if type(state) is not Conversation:
+ state = init_conv()
+ images_tensor = [[], []]
+
+ first_run = False if len(state.messages) > 0 else True
+ text_en_in = textbox_in.replace("picture", "image")
+ output, input_prompt, prompt = motionllm(args, video, text_en_in)
+
+ text_en_out = output
+ textbox_out = text_en_out
+
+ show_images = ""
+ if os.path.exists(image1):
+ filename = save_image_to_local(image1)
+        show_images += f'<img src="./file={filename}" style="display: inline-block; width: 250px; max-height: 400px;">'
+
+ if os.path.exists(video):
+ filename = save_video_to_local(video)
+        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={filename}"></video>'
+
+ show_images = textbox_in + "\n" + show_images
+ state.append_message(output, input_prompt, prompt, show_images)
+
+ torch.cuda.empty_cache()
+
+ return (state, state.to_gradio_chatbot(show_images, output), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(image1) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
+
+def regenerate(state):
+ if len(state.messages) > 0:
+ tobot = state.to_gradio_chatbot()
+ tobot[-1][1] = None
+ textbox = state.messages[-1][1]
+ state.messages.pop(-1)
+ return state, tobot, False, textbox
+    return (state, [], True, None)
+
+
+def clear_history(state):
+ state = init_conv()
+ try:
+ tgt = state.to_gradio_chatbot()
+ except:
+ tgt = [None, None]
+    return (gr.update(value=None, interactive=True),
+            gr.update(value=None, interactive=True),
+            gr.update(value=None, interactive=True),
+            True, state, tgt, [[], []])
+
+
+def get_md5(file_path):
+ hash_md5 = hashlib.md5()
+ with open(file_path, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+
+def logging_up(video, state):
+ try:
+ state.get_info()
+ except:
+ return False
+ action = "upvote"
+ # Get the current time
+ current_time = str(time.time())
+
+ # Create an md5 object
+ hash_object = hashlib.md5(current_time.encode())
+
+ # Get the hexadecimal representation of the hash
+ md5_hash = get_md5(video) + "-" + hash_object.hexdigest()
+
+ command = f"cp {video} ./feedback/{action}/mp4/{md5_hash}.mp4"
+ os.system(command)
+ with open (f"./feedback/{action}/txt/{md5_hash}.txt", "w") as f:
+ out, prp = state.get_info()
+ f.write(f"==========\nPrompt: {prp}\n==========\nOutput: {out}==========\n")
+ return True
+
+
+def logging_down(video, state):
+ try:
+ state.get_info()
+ except:
+ return False
+ action = "downvote"
+ # Get the current time
+ current_time = str(time.time())
+
+ # Create an md5 object
+ hash_object = hashlib.md5(current_time.encode())
+
+ # Get the hexadecimal representation of the hash
+ md5_hash = get_md5(video) + "-" + hash_object.hexdigest()
+
+ command = f"cp {video} ./feedback/{action}/mp4/{md5_hash}.mp4"
+ os.system(command)
+ with open (f"./feedback/{action}/txt/{md5_hash}.txt", "w") as f:
+ out, prp = state.get_info()
+ f.write(f"==========\nPrompt: {prp}\n==========\nOutput: {out}==========\n")
+ return True
+
+
+torch.set_float32_matmul_precision("high")
+warnings.filterwarnings('ignore')
+args = option.get_args_parser()
+
+conv_mode = "llava_v1"
+model_path = 'LanguageBind/Video-LLaVA-7B'
+device = 'cuda'
+load_8bit = False
+load_4bit = True
+dtype = torch.float16
+
+if not os.path.exists("temp"):
+ os.makedirs("temp")
+
+lora_path = Path(args.lora_path)
+pretrained_llm_path = Path(f"./checkpoints/vicuna-7b-v1.5/lit_model.pth")
+tokenizer_llm_path = Path("./checkpoints/vicuna-7b-v1.5/tokenizer.model")
+
+# assert lora_path.is_file()
+assert pretrained_llm_path.is_file()
+assert tokenizer_llm_path.is_file()
+
+accelerator = "auto"
+fabric = L.Fabric(accelerator=accelerator, devices=1)
+
+dtype = "float32"
+dt = getattr(torch, dtype, None)
+if not isinstance(dt, torch.dtype):
+ raise ValueError(f"{dtype} is not a valid dtype.")
+dtype = dt
+
+quantize = None
+t0 = time.time()
+
+with EmptyInitOnDevice(
+ device=fabric.device, dtype=dtype, quantization_mode=quantize
+), lora(r=args.lora_r, alpha=args.lora_alpha, dropout=args.lora_dropout, enabled=True):
+ checkpoint_dir = Path("checkpoints/vicuna-7b-v1.5")
+ lora_query = True
+ lora_key = False
+ lora_value = True
+ lora_projection = False
+ lora_mlp = False
+ lora_head = False
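+    # LoRA adapters are enabled only for the attention query and value projections here.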
+ config = Config.from_name(
+ name=checkpoint_dir.name,
+ r=args.lora_r,
+ alpha=args.lora_alpha,
+ dropout=args.lora_dropout,
+ to_query=lora_query,
+ to_key=lora_key,
+ to_value=lora_value,
+ to_projection=lora_projection,
+ to_mlp=lora_mlp,
+ to_head=lora_head,
+ )
+ model = GPT(config).bfloat16()
+
+mlp_path = args.mlp_path
+pretrained_checkpoint_mlp = torch.load(mlp_path)
+
+X = ['Video']
+
+mm_backbone_mlp_model, processor = get_processor(X, args, 'cuda', pretrained_checkpoint_mlp, model_path = 'LanguageBind/Video-LLaVA-7B')
+video_processor = processor['video']
+
+linear_proj = mm_backbone_mlp_model.mm_projector
+
+# 1. Load the pretrained weights
+pretrained_llm_checkpoint = lazy_load(pretrained_llm_path)
+# 2. Load the fine-tuned LoRA weights
+lora_checkpoint = lazy_load(lora_path)
+# 3. merge the two checkpoints
+model_state_dict = {**pretrained_llm_checkpoint, **lora_checkpoint}
+model.load_state_dict(model_state_dict, strict=True)
+print('Load llm base model from', pretrained_llm_path)
+print('Load lora model from', lora_path)
+
+# Load the MLP projector weights once more to be safe; not strictly necessary.
+linear_proj.load_state_dict(pretrained_checkpoint_mlp)
+linear_proj = linear_proj.cuda()
+print('Load mlp model again from', mlp_path)
+print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr)
+
+model.eval()
+model = fabric.setup_module(model)
+linear_proj.eval()
+
+tokenizer = Tokenizer(tokenizer_llm_path)
+print('Load tokenizer from', tokenizer_llm_path)
+
+print(torch.cuda.memory_allocated())
+print(torch.cuda.max_memory_allocated())
+
+
+app = FastAPI()
+
+textbox = gr.Textbox(
+ show_label=False, placeholder="Enter text and press ENTER", container=False
+ )
+
+with gr.Blocks(title='MotionLLM', theme=gr.themes.Default(), css=block_css) as demo:
+ gr.Markdown(title_markdown)
+ state = gr.State()
+ buffer_ = gr.State()
+ first_run = gr.State()
+ images_tensor = gr.State()
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ image1 = gr.State()
+ video = gr.Video(label="Input Video")
+
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
+ gr.Examples(
+ examples=[
+ [
+ f"{cur_dir}/examples/Play_Electric_guitar_16_clip1.mp4",
+ "why is the girl so happy",
+ ],
+ [
+ f"{cur_dir}/examples/guoyoucai.mov",
+ "what is the feeling of him",
+ ],
+ [
+ f"{cur_dir}/examples/sprint_run_18_clip1.mp4",
+ "Why is the man running so fast?",
+ ],
+ [
+ f"{cur_dir}/examples/lift_weight.mp4",
+ "Assume you are a fitness coach, refer to the video of the professional athlete, please analyze specific action essentials in steps and give detailed instruction.",
+ ],
+ [
+ f"{cur_dir}/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4",
+ "wow, can you teach me the motion, step by step in detail",
+ ],
+ [
+ f"{cur_dir}/examples/mabaoguo.mp4",
+ "why is the video funny?",
+ ],
+ [
+ f"{cur_dir}/examples/COBRA_PUSH_UPS_clip2.mp4",
+ "describe the body movement of the woman",
+ ],
+ [
+ f"{cur_dir}/examples/sample_demo_1.mp4",
+ "Why is this video interesting?",
+ ],
+ ],
+ inputs=[video, textbox],
+ )
+
+ with gr.Column(scale=7):
+ chatbot = gr.Chatbot(label="MotionLLM", bubble_full_width=True).style(height=875)
+ with gr.Row():
+ with gr.Column(scale=8):
+ textbox.render()
+ with gr.Column(scale=1, min_width=50):
+ submit_btn = gr.Button(
+ value="Send", variant="primary", interactive=True
+ )
+ with gr.Row(elem_id="buttons") as button_row:
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+ flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
+
+ gr.Markdown(tos_markdown)
+ gr.Markdown(learn_more_markdown)
+
+ tmp = gr.State()
+ upvote_btn.click(logging_up, [video, state], [tmp])
+
+ downvote_btn.click(logging_down, [video, state], [tmp])
+
+ submit_btn.click(generate, [image1, video, textbox, first_run, state, images_tensor],
+ [state, chatbot, first_run, textbox, images_tensor, image1, video])
+
+ regenerate_btn.click(regenerate, [state], [state, chatbot, first_run, textbox]).then(
+ generate, [image1, video, textbox, first_run, state, images_tensor], [state, chatbot, first_run, textbox, images_tensor, image1, video])
+
+ clear_btn.click(clear_history, [state],
+ [image1, video, textbox, first_run, state, chatbot, images_tensor])
+
+app = gr.mount_gradio_app(app, demo, path="/")
+uvicorn.run(app, host="0.0.0.0", port=6657)
\ No newline at end of file
diff --git a/examples/COBRA_PUSH_UPS_clip2.mp4 b/examples/COBRA_PUSH_UPS_clip2.mp4
new file mode 100644
index 0000000..de3486d
Binary files /dev/null and b/examples/COBRA_PUSH_UPS_clip2.mp4 differ
diff --git a/examples/Play_Electric_guitar_16_clip1.mp4 b/examples/Play_Electric_guitar_16_clip1.mp4
new file mode 100644
index 0000000..9fe2bf8
Binary files /dev/null and b/examples/Play_Electric_guitar_16_clip1.mp4 differ
diff --git a/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4 b/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4
new file mode 100644
index 0000000..ce83a78
Binary files /dev/null and b/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4 differ
diff --git a/examples/guoyoucai.mov b/examples/guoyoucai.mov
new file mode 100644
index 0000000..b1a2aa8
Binary files /dev/null and b/examples/guoyoucai.mov differ
diff --git a/examples/guoyoucai.mp4 b/examples/guoyoucai.mp4
new file mode 100644
index 0000000..aa1c944
Binary files /dev/null and b/examples/guoyoucai.mp4 differ
diff --git a/examples/lift_weight.mp4 b/examples/lift_weight.mp4
new file mode 100644
index 0000000..dc1766b
Binary files /dev/null and b/examples/lift_weight.mp4 differ
diff --git a/examples/mabaoguo.mp4 b/examples/mabaoguo.mp4
new file mode 100644
index 0000000..a03aa2b
Binary files /dev/null and b/examples/mabaoguo.mp4 differ
diff --git a/examples/sample_demo_1.mp4 b/examples/sample_demo_1.mp4
new file mode 100644
index 0000000..8afbc6c
Binary files /dev/null and b/examples/sample_demo_1.mp4 differ
diff --git a/examples/sprint_run_18_clip1.mp4 b/examples/sprint_run_18_clip1.mp4
new file mode 100644
index 0000000..9845521
Binary files /dev/null and b/examples/sprint_run_18_clip1.mp4 differ
diff --git a/generate.py b/generate.py
new file mode 100755
index 0000000..677a171
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,199 @@
+import sys
+import time
+import warnings
+from pathlib import Path
+from typing import Optional
+
+import lightning as L
+import torch
+
+from lit_llama import LLaMA, Tokenizer
+from lit_llama.utils import EmptyInitOnDevice, lazy_load
+
+
+@torch.no_grad()
+def generate(
+ model: torch.nn.Module,
+ idx: torch.Tensor,
+ max_new_tokens: int,
+ max_seq_length: int,
+ temperature: float = 1.0,
+ top_k: Optional[int] = None,
+ eos_id: Optional[int] = None,
+ tokenizer = None,
+) -> torch.Tensor:
+ """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
+
+ The implementation of this function is modified from A. Karpathy's nanoGPT.
+
+ Args:
+ model: The model to use.
+ idx: Tensor of shape (T) with indices of the prompt sequence.
+ max_new_tokens: The number of new tokens to generate.
+ max_seq_length: The maximum sequence length allowed.
+        temperature: Scales the predicted logits by 1 / temperature.
+        top_k: If specified, only sample among the tokens with the k highest probabilities.
+        eos_id: If specified, stop generation as soon as this token is produced.
+ """
+ # create an empty tensor of the expected final shape and fill in the current tokens
+ # import pdb; pdb.set_trace()
+ if type(idx) == tuple:
+ # import pdb; pdb.set_trace()
+ T = idx[0].shape[-1] + idx[2].shape[-1] + len(idx[1])
+ before_len = idx[0].shape[-1]
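+        # Build the full prompt: prefix ids, one zero placeholder per video-feature token, then suffix ids.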
+ catted = torch.cat((idx[0], torch.zeros((1, len(idx[1]))).cuda(), idx[2]), dim=1).long()
+ idx = (catted, idx[1], before_len)
+ T_new = T + max_new_tokens
+ # import pdb; pdb.set_trace()
+        empty = torch.empty(T_new, dtype=idx[0].dtype, device=idx[0].device)
+ empty[:T] = idx[0]
+ idx = (empty, idx[1], [before_len])
+ # import pdb; pdb.set_trace()
+ else:
+ # import pdb; pdb.set_trace()
+ T = idx.size(0)
+ T_new = T + max_new_tokens
+ empty = torch.empty(T_new, dtype=idx.dtype, device=idx.device)
+ empty[:T] = idx
+ idx = empty
+
+ # generate max_new_tokens tokens
+ # import pdb; pdb.set_trace()
+ for t in range(T, T_new):
+ if type(idx) == tuple:
+ idx_cond = idx[0][:t]
+ tmp = idx_cond if T <= max_seq_length else idx_cond[-max_seq_length:]
+ # import pdb; pdb.set_trace()
+ idx_cond = (tmp.view(1, -1), idx[1].unsqueeze(0), idx[2])
+ else:
+ # ignore the not-filled-yet tokens
+ idx_cond = idx[:t]
+ # if the sequence context is growing too long we must crop it at max_seq_length
+ idx_cond = idx_cond if T <= max_seq_length else idx_cond[-max_seq_length:]
+
+ # forward
+ if type(idx) == tuple:
+ logits = model(idx_cond, maxlen=idx_cond[0].size(1))
+ else:
+ logits = model(idx_cond.view(1, -1))
+ logits = logits[0, -1] / temperature
+
+ # import pdb; pdb.set_trace()
+ # optionally crop the logits to only the top k options
+ if top_k is not None:
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+ logits[logits < v[[-1]]] = -float("Inf")
+
+ probs = torch.nn.functional.softmax(logits, dim=-1)
+ idx_next = torch.multinomial(probs, num_samples=1)
+
+ # concatenate the new generation
+ if type(idx) == tuple:
+ seq = idx[0]
+ seq[t] = idx_next
+ idx = (seq, idx[1], idx[2])
+ else:
+ idx[t] = idx_next
+
+ # if token is triggered, return the output (stop generation)
+ if idx_next == eos_id:
+ if type(idx) == tuple:
+ return idx[0][:t+1]
+ else:
+ return idx[:t + 1] # include the EOS token
+ if type(idx) == tuple:
+ return idx[0]
+ else:
+ return idx
+
+
+def main(
+ prompt: str = "Hello, my name is",
+ *,
+ num_samples: int = 1,
+ max_new_tokens: int = 50,
+ top_k: int = 200,
+ temperature: float = 0.8,
+ checkpoint_path: Optional[Path] = None,
+ tokenizer_path: Optional[Path] = None,
+ model_size: str = "7B",
+ quantize: Optional[str] = None,
+) -> None:
+ """Generates text samples based on a pre-trained LLaMA model and tokenizer.
+
+ Args:
+ prompt: The prompt string to use for generating the samples.
+ num_samples: The number of text samples to generate.
+ max_new_tokens: The number of generation steps to take.
+ top_k: The number of top most probable tokens to consider in the sampling process.
+ temperature: A value controlling the randomness of the sampling process. Higher values result in more random
+ samples.
+ checkpoint_path: The checkpoint path to load.
+ tokenizer_path: The tokenizer path to load.
+ model_size: The model size to load.
+        quantize: Whether to quantize the model and, if so, which method to use:
+ ``"llm.int8"``: LLM.int8() mode,
+ ``"gptq.int4"``: GPTQ 4-bit mode.
+ """
+ if not checkpoint_path:
+ checkpoint_path = Path(f"./checkpoints/lit-llama/{model_size}/lit-llama.pth")
+ if not tokenizer_path:
+ tokenizer_path = Path("./checkpoints/lit-llama/tokenizer.model")
+ assert checkpoint_path.is_file(), checkpoint_path
+ assert tokenizer_path.is_file(), tokenizer_path
+
+ fabric = L.Fabric(accelerator="cuda", devices=1)
+ dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
+
+ print("Loading model ...", file=sys.stderr)
+ t0 = time.time()
+ with EmptyInitOnDevice(
+ device=fabric.device, dtype=dtype, quantization_mode=quantize
+ ):
+ model = LLaMA.from_name(model_size)
+
+ checkpoint = lazy_load(checkpoint_path)
+ model.load_state_dict(checkpoint)
+ print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr)
+
+ model.eval()
+ model = fabric.setup_module(model)
+
+ tokenizer = Tokenizer(tokenizer_path)
+ encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False, device=fabric.device)
+
+ L.seed_everything(1234)
+ t0 = time.perf_counter()
+
+ for _ in range(num_samples):
+ y = generate(
+ model,
+ encoded_prompt,
+ max_new_tokens,
+ model.config.block_size, # type: ignore[union-attr,arg-type]
+ temperature=temperature,
+ top_k=top_k,
+ )
+ print(tokenizer.decode(y))
+
+ t = time.perf_counter() - t0
+ print(f"\n\nTime for inference: {t:.02f} sec total, {num_samples * max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
+ print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
+
+
+if __name__ == "__main__":
+ from jsonargparse import CLI
+
+ torch.set_float32_matmul_precision("high")
+ warnings.filterwarnings(
+ # Triggered internally at ../aten/src/ATen/EmptyTensor.cpp:31
+ "ignore",
+ message="ComplexHalf support is experimental and many operators don't support it yet"
+ )
+ warnings.filterwarnings(
+ # Triggered in bitsandbytes/autograd/_functions.py:298
+ "ignore",
+ message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization",
+ )
+ CLI(main)
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/models/constants.py b/models/constants.py
new file mode 100755
index 0000000..f1bcfae
--- /dev/null
+++ b/models/constants.py
@@ -0,0 +1,18 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
+IGNORE_INDEX = -100
+X_TOKEN_INDEX = {'IMAGE': -200, 'VIDEO': -201, 'AUDIO': -202, 'THERMAL': -203, 'DEPTH': -204}
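+# Negative indices are multimodal placeholder token ids; they are replaced by the corresponding encoder features at embedding time.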
+X_INDEX_TOKEN = {v: k for k, v in X_TOKEN_INDEX.items()}
+# IMAGE_TOKEN_INDEX = -200
+DEFAULT_X_TOKEN = {'IMAGE': "<image>", 'VIDEO': "<video>", 'AUDIO': "<audio>", 'THERMAL': "<thermal>", 'DEPTH': "<depth>"}
+# DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_X_PATCH_TOKEN = {'IMAGE': "<im_patch>", 'VIDEO': "<vi_patch>", 'AUDIO': "<au_patch>", 'THERMAL': "<th_patch>", 'DEPTH': "<de_patch>"}
+# DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_X_START_TOKEN = {'IMAGE': "<im_start>", 'VIDEO': "<vi_start>", 'AUDIO': "<au_start>", 'THERMAL': "<th_start>", 'DEPTH': "<de_start>"}
+# DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_X_END_TOKEN = {'IMAGE': "<im_end>", 'VIDEO': "<vi_end>", 'AUDIO': "<au_end>", 'THERMAL': "<th_end>", 'DEPTH': "<de_end>"}
+# DEFAULT_IM_END_TOKEN = "<im_end>"
diff --git a/models/encdec.py b/models/encdec.py
new file mode 100755
index 0000000..ae72afa
--- /dev/null
+++ b/models/encdec.py
@@ -0,0 +1,67 @@
+import torch.nn as nn
+from models.resnet import Resnet1D
+
+class Encoder(nn.Module):
+ def __init__(self,
+ input_emb_width = 3,
+ output_emb_width = 512,
+ down_t = 3,
+ stride_t = 2,
+ width = 512,
+ depth = 3,
+ dilation_growth_rate = 3,
+ activation='relu',
+ norm=None):
+ super().__init__()
+
+ blocks = []
+ filter_t, pad_t = stride_t * 2, stride_t // 2
+ blocks.append(nn.Conv1d(input_emb_width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+
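+        # Each of the down_t blocks downsamples the temporal axis with a strided Conv1d (stride_t) followed by a Resnet1D stack.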
+ for i in range(down_t):
+ input_dim = width
+ block = nn.Sequential(
+ nn.Conv1d(input_dim, width, filter_t, stride_t, pad_t),
+ Resnet1D(width, depth, dilation_growth_rate, activation=activation, norm=norm),
+ )
+ blocks.append(block)
+ blocks.append(nn.Conv1d(width, output_emb_width, 3, 1, 1))
+ self.model = nn.Sequential(*blocks)
+
+ def forward(self, x):
+ return self.model(x)
+
+class Decoder(nn.Module):
+ def __init__(self,
+ input_emb_width = 3,
+ output_emb_width = 512,
+ down_t = 3,
+ stride_t = 2,
+ width = 512,
+ depth = 3,
+ dilation_growth_rate = 3,
+ activation='relu',
+ norm=None):
+ super().__init__()
+ blocks = []
+
+ filter_t, pad_t = stride_t * 2, stride_t // 2
+ blocks.append(nn.Conv1d(output_emb_width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+ for i in range(down_t):
+ out_dim = width
+ block = nn.Sequential(
+ Resnet1D(width, depth, dilation_growth_rate, reverse_dilation=True, activation=activation, norm=norm),
+ nn.Upsample(scale_factor=2, mode='nearest'),
+ nn.Conv1d(width, out_dim, 3, 1, 1)
+ )
+ blocks.append(block)
+ blocks.append(nn.Conv1d(width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+ blocks.append(nn.Conv1d(width, input_emb_width, 3, 1, 1))
+ self.model = nn.Sequential(*blocks)
+
+ def forward(self, x):
+ return self.model(x)
+
diff --git a/models/evaluator_wrapper.py b/models/evaluator_wrapper.py
new file mode 100755
index 0000000..fe4558a
--- /dev/null
+++ b/models/evaluator_wrapper.py
@@ -0,0 +1,92 @@
+
+import torch
+from os.path import join as pjoin
+import numpy as np
+from models.modules import MovementConvEncoder, TextEncoderBiGRUCo, MotionEncoderBiGRUCo
+from utils.word_vectorizer import POS_enumerator
+
+def build_models(opt):
+ movement_enc = MovementConvEncoder(opt.dim_pose-4, opt.dim_movement_enc_hidden, opt.dim_movement_latent)
+ text_enc = TextEncoderBiGRUCo(word_size=opt.dim_word,
+ pos_size=opt.dim_pos_ohot,
+ hidden_size=opt.dim_text_hidden,
+ output_size=opt.dim_coemb_hidden,
+ device=opt.device)
+
+ motion_enc = MotionEncoderBiGRUCo(input_size=opt.dim_movement_latent,
+ hidden_size=opt.dim_motion_hidden,
+ output_size=opt.dim_coemb_hidden,
+ device=opt.device)
+
+ checkpoint = torch.load(pjoin(opt.checkpoints_dir, opt.dataset_name, 'text_mot_match', 'model', 'finest.tar'),
+ map_location=opt.device)
+ movement_enc.load_state_dict(checkpoint['movement_encoder'])
+ text_enc.load_state_dict(checkpoint['text_encoder'])
+ motion_enc.load_state_dict(checkpoint['motion_encoder'])
+ print('Loading Evaluation Model Wrapper (Epoch %d) Completed!!' % (checkpoint['epoch']))
+ return text_enc, motion_enc, movement_enc
+
+
+class EvaluatorModelWrapper(object):
+
+ def __init__(self, opt):
+
+ if opt.dataset_name == 't2m':
+ opt.dim_pose = 263
+ elif opt.dataset_name == 'kit':
+ opt.dim_pose = 251
+ else:
+ raise KeyError('Dataset not Recognized!!!')
+
+ opt.dim_word = 300
+ opt.max_motion_length = 196
+ opt.dim_pos_ohot = len(POS_enumerator)
+ opt.dim_motion_hidden = 1024
+ opt.max_text_len = 20
+ opt.dim_text_hidden = 512
+ opt.dim_coemb_hidden = 512
+
+ # print(opt)
+
+ self.text_encoder, self.motion_encoder, self.movement_encoder = build_models(opt)
+ self.opt = opt
+ self.device = opt.device
+
+ self.text_encoder.to(opt.device)
+ self.motion_encoder.to(opt.device)
+ self.movement_encoder.to(opt.device)
+
+ self.text_encoder.eval()
+ self.motion_encoder.eval()
+ self.movement_encoder.eval()
+
+    # Please note that the results do not follow the order of the inputs.
+ def get_co_embeddings(self, word_embs, pos_ohot, cap_lens, motions, m_lens):
+ with torch.no_grad():
+ word_embs = word_embs.detach().to(self.device).float()
+ pos_ohot = pos_ohot.detach().to(self.device).float()
+ motions = motions.detach().to(self.device).float()
+
+ '''Movement Encoding'''
+ movements = self.movement_encoder(motions[..., :-4]).detach()
+ m_lens = m_lens // self.opt.unit_length
+ motion_embedding = self.motion_encoder(movements, m_lens)
+
+ '''Text Encoding'''
+ text_embedding = self.text_encoder(word_embs, pos_ohot, cap_lens)
+ return text_embedding, motion_embedding
+
+    # Please note that the results do not follow the order of the inputs.
+ def get_motion_embeddings(self, motions, m_lens):
+ with torch.no_grad():
+ motions = motions.detach().to(self.device).float()
+
+ align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()
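+            # Sort motions by length (descending) before encoding; this is why the outputs do not follow the input order.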
+ motions = motions[align_idx]
+ m_lens = m_lens[align_idx]
+
+ '''Movement Encoding'''
+ movements = self.movement_encoder(motions[..., :-4]).detach()
+ m_lens = m_lens // self.opt.unit_length
+ motion_embedding = self.motion_encoder(movements, m_lens)
+ return motion_embedding
diff --git a/models/modules.py b/models/modules.py
new file mode 100755
index 0000000..4f06cd9
--- /dev/null
+++ b/models/modules.py
@@ -0,0 +1,109 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence
+
+def init_weight(m):
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d):
+ nn.init.xavier_normal_(m.weight)
+ # m.bias.data.fill_(0.01)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+
+class MovementConvEncoder(nn.Module):
+ def __init__(self, input_size, hidden_size, output_size):
+ super(MovementConvEncoder, self).__init__()
+ self.main = nn.Sequential(
+ nn.Conv1d(input_size, hidden_size, 4, 2, 1),
+ nn.Dropout(0.2, inplace=True),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv1d(hidden_size, output_size, 4, 2, 1),
+ nn.Dropout(0.2, inplace=True),
+ nn.LeakyReLU(0.2, inplace=True),
+ )
+ self.out_net = nn.Linear(output_size, output_size)
+ self.main.apply(init_weight)
+ self.out_net.apply(init_weight)
+
+ def forward(self, inputs):
+ inputs = inputs.permute(0, 2, 1)
+ outputs = self.main(inputs).permute(0, 2, 1)
+ # print(outputs.shape)
+ return self.out_net(outputs)
+
+
+
+class TextEncoderBiGRUCo(nn.Module):
+ def __init__(self, word_size, pos_size, hidden_size, output_size, device):
+ super(TextEncoderBiGRUCo, self).__init__()
+ self.device = device
+
+ self.pos_emb = nn.Linear(pos_size, word_size)
+ self.input_emb = nn.Linear(word_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
+ self.output_net = nn.Sequential(
+ nn.Linear(hidden_size * 2, hidden_size),
+ nn.LayerNorm(hidden_size),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(hidden_size, output_size)
+ )
+
+ self.input_emb.apply(init_weight)
+ self.pos_emb.apply(init_weight)
+ self.output_net.apply(init_weight)
+ self.hidden_size = hidden_size
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
+
+ # input(batch_size, seq_len, dim)
+ def forward(self, word_embs, pos_onehot, cap_lens):
+ num_samples = word_embs.shape[0]
+
+ pos_embs = self.pos_emb(pos_onehot)
+ inputs = word_embs + pos_embs
+ input_embs = self.input_emb(inputs)
+ hidden = self.hidden.repeat(1, num_samples, 1)
+
+ cap_lens = cap_lens.data.tolist()
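+        # pack_padded_sequence is called without enforce_sorted=False, so cap_lens must already be sorted in descending order.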
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
+
+ gru_seq, gru_last = self.gru(emb, hidden)
+
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+ return self.output_net(gru_last)
+
+
+class MotionEncoderBiGRUCo(nn.Module):
+ def __init__(self, input_size, hidden_size, output_size, device):
+ super(MotionEncoderBiGRUCo, self).__init__()
+ self.device = device
+
+ self.input_emb = nn.Linear(input_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
+ self.output_net = nn.Sequential(
+ nn.Linear(hidden_size*2, hidden_size),
+ nn.LayerNorm(hidden_size),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(hidden_size, output_size)
+ )
+
+ self.input_emb.apply(init_weight)
+ self.output_net.apply(init_weight)
+ self.hidden_size = hidden_size
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
+
+ # input(batch_size, seq_len, dim)
+ def forward(self, inputs, m_lens):
+ num_samples = inputs.shape[0]
+
+ input_embs = self.input_emb(inputs)
+ hidden = self.hidden.repeat(1, num_samples, 1)
+
+ cap_lens = m_lens.data.tolist()
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True, enforce_sorted=False)
+
+ gru_seq, gru_last = self.gru(emb, hidden)
+
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+ return self.output_net(gru_last)
diff --git a/models/multimodal_encoder/builder.py b/models/multimodal_encoder/builder.py
new file mode 100755
index 0000000..85f6eda
--- /dev/null
+++ b/models/multimodal_encoder/builder.py
@@ -0,0 +1,49 @@
+import os
+from .clip_encoder import CLIPVisionTower
+from .languagebind import LanguageBindImageTower, LanguageBindVideoTower
+from .mae_encoder import MAEVisionTower
+from transformers import CLIPModel
+
+def build_image_tower(image_tower_cfg, **kwargs):
+ image_tower = getattr(image_tower_cfg, 'mm_image_tower', getattr(image_tower_cfg, 'image_tower', None))
+ is_absolute_path_exists = os.path.exists(image_tower)
+ if is_absolute_path_exists or image_tower.startswith("openai") or image_tower.startswith("laion"):
+ return CLIPVisionTower(image_tower, args=image_tower_cfg, **kwargs)
+ if image_tower.endswith('LanguageBind_Image'):
+ return LanguageBindImageTower(image_tower, args=image_tower_cfg, cache_dir='./cache_dir', **kwargs)
+    if 'mae' in image_tower:
+        print(f'Using MAE vision tower: {image_tower}')
+        return MAEVisionTower(image_tower, args=image_tower_cfg, cache_dir='./cache_dir', **kwargs)
+ raise ValueError(f'Unknown image tower: {image_tower}')
+
+def build_video_tower(video_tower_cfg, **kwargs):
+ video_tower = getattr(video_tower_cfg, 'mm_video_tower', getattr(video_tower_cfg, 'video_tower', None))
+ if video_tower.endswith('LanguageBind_Video_merge'):
+ return LanguageBindVideoTower(video_tower, args=video_tower_cfg, cache_dir='./cache_dir', **kwargs)
+ raise ValueError(f'Unknown video tower: {video_tower}')
+
diff --git a/models/multimodal_encoder/clip_encoder.py b/models/multimodal_encoder/clip_encoder.py
new file mode 100755
index 0000000..dbb9015
--- /dev/null
+++ b/models/multimodal_encoder/clip_encoder.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn as nn
+
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+
+
+class CLIPVisionTower(nn.Module):
+ def __init__(self, vision_tower, args, delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+ def load_model(self):
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
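+        # Take the hidden states of the configured layer; 'patch' drops the leading CLS token, 'cls_patch' keeps it.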
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
diff --git a/models/multimodal_encoder/languagebind/__init__.py b/models/multimodal_encoder/languagebind/__init__.py
new file mode 100755
index 0000000..3a4e50d
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/__init__.py
@@ -0,0 +1,285 @@
+import torch
+from torch import nn
+from transformers import AutoConfig
+
+from .image.configuration_image import LanguageBindImageConfig
+from .image.modeling_image import LanguageBindImage
+from .image.tokenization_image import LanguageBindImageTokenizer
+from .image.processing_image import LanguageBindImageProcessor
+
+from .video.configuration_video import LanguageBindVideoConfig
+from .video.modeling_video import LanguageBindVideo
+from .video.tokenization_video import LanguageBindVideoTokenizer
+from .video.processing_video import LanguageBindVideoProcessor
+
+from .depth.configuration_depth import LanguageBindDepthConfig
+from .depth.modeling_depth import LanguageBindDepth
+from .depth.tokenization_depth import LanguageBindDepthTokenizer
+from .depth.processing_depth import LanguageBindDepthProcessor
+
+from .audio.configuration_audio import LanguageBindAudioConfig
+from .audio.modeling_audio import LanguageBindAudio
+from .audio.tokenization_audio import LanguageBindAudioTokenizer
+from .audio.processing_audio import LanguageBindAudioProcessor
+
+from .thermal.configuration_thermal import LanguageBindThermalConfig
+from .thermal.modeling_thermal import LanguageBindThermal
+from .thermal.tokenization_thermal import LanguageBindThermalTokenizer
+from .thermal.processing_thermal import LanguageBindThermalProcessor
+
+
+
+config_dict = {
+ 'thermal': LanguageBindThermalConfig,
+ 'image': LanguageBindImageConfig,
+ 'video': LanguageBindVideoConfig,
+ 'depth': LanguageBindDepthConfig,
+ 'audio': LanguageBindAudioConfig
+}
+model_dict = {
+ 'thermal': LanguageBindThermal,
+ 'image': LanguageBindImage,
+ 'video': LanguageBindVideo,
+ 'depth': LanguageBindDepth,
+ 'audio': LanguageBindAudio
+}
+transform_dict = {
+ 'video': LanguageBindVideoProcessor,
+ 'audio': LanguageBindAudioProcessor,
+ 'depth': LanguageBindDepthProcessor,
+ 'thermal': LanguageBindThermalProcessor,
+ 'image': LanguageBindImageProcessor,
+}
+
+class LanguageBind(nn.Module):
+ def __init__(self, clip_type=('thermal', 'image', 'video', 'depth', 'audio'), use_temp=True, cache_dir='./cache_dir'):
+ super(LanguageBind, self).__init__()
+ self.use_temp = use_temp
+ self.modality_encoder = {}
+ self.modality_proj = {}
+ self.modality_scale = {}
+ self.modality_config = {}
+ for c in clip_type:
+ pretrained_ckpt = f'LanguageBind/LanguageBind_{c.capitalize()}'
+ model = model_dict[c].from_pretrained(pretrained_ckpt, cache_dir=cache_dir)
+ self.modality_encoder[c] = model.vision_model
+ self.modality_proj[c] = model.visual_projection
+ self.modality_scale[c] = model.logit_scale
+ self.modality_config[c] = model.config
+ self.modality_encoder['language'] = model.text_model
+ self.modality_proj['language'] = model.text_projection
+
+ self.modality_encoder = nn.ModuleDict(self.modality_encoder)
+ self.modality_proj = nn.ModuleDict(self.modality_proj)
+
+ def forward(self, inputs):
+ outputs = {}
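+        # For each modality: encode, take the pooled output, project to the shared space, L2-normalize,
+        # and (for non-language inputs) scale by the learned logit temperature.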
+ for key, value in inputs.items():
+ value = self.modality_encoder[key](**value)[1]
+ value = self.modality_proj[key](value)
+ value = value / value.norm(p=2, dim=-1, keepdim=True)
+ if self.use_temp:
+ if key != 'language':
+ value = value * self.modality_scale[key].exp()
+ outputs[key] = value
+ return outputs
+
+def to_device(x, device):
+ out_dict = {k: v.to(device) for k, v in x.items()}
+ return out_dict
+
+
+
+
+class LanguageBindImageTower(nn.Module):
+ def __init__(self, image_tower, args, delay_load=False, cache_dir='./cache_dir'):
+ super().__init__()
+ # import pdb; pdb.set_trace()
+ self.is_loaded = False
+
+ self.image_tower_name = image_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ self.cache_dir = cache_dir
+
+ if not delay_load:
+ self.load_model()
+ else:
+ # import pdb; pdb.set_trace()
+ self.cfg_only = LanguageBindImageConfig.from_pretrained(self.image_tower_name, cache_dir=self.cache_dir)
+
+ ############################################################
+ def load_model(self):
+ model = LanguageBindImage.from_pretrained(self.image_tower_name, cache_dir=self.cache_dir)
+ self.image_tower = model.vision_model
+ self.image_tower.requires_grad_(False)
+
+ self.image_processor = LanguageBindImageProcessor(model.config)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.image_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ # print('images', images.shape)
+ image_forward_outs = self.image_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ # print('image_forward_outs', len(image_forward_outs), image_forward_outs[0].shape)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+ # print('image_features', image_features.shape)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.image_tower.embeddings.class_embedding.dtype #############
+
+ @property
+ def device(self):
+ return self.image_tower.embeddings.class_embedding.device ##############
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.image_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
+
+class temp_model(nn.Module):
+ def __init__(self):
+ super(temp_model, self).__init__()
+ def forward(self, **kwargs):
+ return torch.randn(25, 1, 256, 1024)
+
+
+class LanguageBindVideoTower(nn.Module):
+ def __init__(self, video_tower, args, delay_load=False, cache_dir='./cache_dir'):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.video_tower_name = video_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ self.cache_dir = cache_dir
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = LanguageBindVideoConfig.from_pretrained(self.video_tower_name, cache_dir=self.cache_dir)
+
+        ## With delay_load, self.is_loaded remains False after from_pretrained until load_model() is called.
+ # import pdb; pdb.set_trace()
+
+ ############################################################
+ def load_model(self):
+ model = LanguageBindVideo.from_pretrained(self.video_tower_name, cache_dir=self.cache_dir)
+ self.video_processor = LanguageBindVideoProcessor(model.config)
+
+
+ # model = LanguageBindImage.from_pretrained('LanguageBind/LanguageBind_Image', cache_dir=self.cache_dir)
+ self.video_tower = model.vision_model
+ self.video_tower.requires_grad_(False)
+
+
+ self.is_loaded = True
+
+ def feature_select(self, video_forward_outs):
+ # print('len(video_forward_outs.hidden_states)', len(video_forward_outs.hidden_states))
+ video_features = video_forward_outs.hidden_states[self.select_layer] # b t n c
+ b, t, n, c = video_features.shape
+ # print('video_features', video_features.shape)
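+        # 'patch': drop each frame's CLS token, then flatten (frames x patches) into a single token sequence per video.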
+ if self.select_feature == 'patch':
+ # video_features = video_features[:, 1:]
+ video_features = video_features[:, :, 1:]
+ video_features = video_features.reshape(b, -1, c)
+ elif self.select_feature == 'cls_patch':
+ # video_features = video_features
+ video_features = video_features.reshape(b, -1, c)
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return video_features
+
+ @torch.no_grad()
+ def forward(self, videos):
+ # import pdb; pdb.set_trace()
+ if type(videos) is list:
+ video_features = []
+ for video in videos:
+ video_forward_out = self.video_tower(video.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ video_feature = self.feature_select(video_forward_out).to(video.dtype)
+ video_features.append(video_feature)
+ else:
+ # print(11111111111, videos.shape)
+ video_forward_outs = self.video_tower(videos.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ video_features = self.feature_select(video_forward_outs).to(videos.dtype)
+
+ return video_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.video_tower.embeddings.class_embedding.dtype #############
+ # return torch.randn(1).cuda().dtype
+
+ @property
+ def device(self):
+ return self.video_tower.embeddings.class_embedding.device ##############
+ # return torch.randn(1).cuda().device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.video_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
diff --git a/models/multimodal_encoder/languagebind/audio/configuration_audio.py b/models/multimodal_encoder/languagebind/audio/configuration_audio.py
new file mode 100755
index 0000000..865a496
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/configuration_audio.py
@@ -0,0 +1,430 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific arguments (not in the vanilla CLIPVisionConfig)
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ audio_sample_rate=16000,
+ audio_mean=0.5,
+ audio_std=0.5,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific fields
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ self.audio_sample_rate = audio_sample_rate
+ self.audio_mean = audio_mean
+ self.audio_std = audio_std
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindAudioConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindAudio"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+            # Warn if a key exists in both `_text_config_dict` and `text_config` with different values.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+            # Warn if a key exists in both `_vision_config_dict` and `vision_config` with different values.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+ logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
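+if __name__ == "__main__":
+    # Minimal composition sketch (illustrative only; the numeric values below
+    # are placeholders, not the released LanguageBind-Audio settings).
+    text_cfg = CLIPTextConfig()
+    vision_cfg = CLIPVisionConfig(
+        add_time_attn=True,    # enable the temporal-attention branch
+        num_frames=3,
+        num_mel_bins=128,      # placeholder spectrogram height
+        target_length=1024,    # placeholder spectrogram width
+    )
+    cfg = LanguageBindAudioConfig.from_text_vision_configs(text_cfg, vision_cfg)
+    print(cfg.model_type, cfg.vision_config.num_mel_bins)  # LanguageBindAudio 128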
diff --git a/models/multimodal_encoder/languagebind/audio/modeling_audio.py b/models/multimodal_encoder/languagebind/audio/modeling_audio.py
new file mode 100755
index 0000000..908ab43
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/modeling_audio.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_audio import LanguageBindAudioConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
+
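+# Illustrative shape check for PatchDropout (not part of the original file):
+# with prob=0.5 in training mode, roughly half of the non-CLS tokens are kept:
+#
+#   pd = PatchDropout(prob=0.5); pd.train()
+#   out = pd(torch.randn(4, 257, 768), B=4, T=1)
+#   # out.shape == (4, 129, 768): the CLS token + 128 of the 256 patch tokens
+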
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
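+# Note on the temporal branch above (illustrative, not part of the original
+# file): when add_time_attn is enabled the tokens arrive flattened as
+# (batch*frames, patches, dim). The layer rearranges them to
+# (batch*patches, frames, dim) so `temporal_attn` attends across the T frames
+# of each spatial position, rearranges back, and only then applies the usual
+# spatial self-attention over the patch tokens of each frame.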
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindAudioConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindAudio):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
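+# Illustrative note (not part of the original file): for an input shape of
+# (1, 3) and dtype float32, _make_causal_mask returns a (1, 1, 3, 3) tensor
+# with 0 on and below the diagonal and torch.finfo(torch.float32).min above
+# it, so token i may only attend to positions <= i.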
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Flatten any extra leading dimensions so every frame / mel chunk is
+        # encoded as an independent (C, H, W) image; B and T are kept to
+        # restore the temporal grouping after encoding.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # optional PatchDropout over the patch tokens
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # average the pooled CLS features over the T frames / chunks
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindAudio(CLIPPreTrainedModel):
+ config_class = LanguageBindAudioConfig
+
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
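+        # Illustrative note (not part of the original file): with the defaults
+        # (lora_r=2, lora_alpha=16) and add_time_attn=True, LoRA adapters are
+        # attached only to the temporal attention / temporal MLP projections;
+        # the returned PeftModel keeps the pretrained CLIP weights frozen, so
+        # only the low-rank A/B matrices remain trainable.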
+
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
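+
+        # Illustrative arithmetic (not part of the original file), using the
+        # placeholder values num_mel_bins=128, target_length=1024 and the
+        # default patch_size=32: the pretrained (7, 7) grid (49 patches + CLS
+        # = 50 positions) is bicubically interpolated to a (4, 32) grid
+        # (128 patches + CLS = 129 positions).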
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindAudioConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/audio/processing_audio.py b/models/multimodal_encoder/languagebind/audio/processing_audio.py
new file mode 100755
index 0000000..8c9baec
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/processing_audio.py
@@ -0,0 +1,190 @@
+import cv2
+import numpy as np
+import torch
+try:
+    # torchaudio is needed by the audio branch below (waveform loading + kaldi fbank);
+    # guard the import so this module stays importable when only video/image are used.
+    import torchaudio
+except ImportError:
+    torchaudio = None
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from torch.nn import functional as F
+
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+
+#torchaudio.set_audio_backend("soundfile")
+
+def torchaudio_loader(path):
+ return torchaudio.load(path)
+
+def int16_to_float32_torch(x):
+ return (x / 32767.0).type(torch.float32)
+
+def float32_to_int16_torch(x):
+ x = torch.clamp(x, min=-1., max=1.)
+ return (x * 32767.).type(torch.int16)
+
+DEFAULT_AUDIO_FRAME_SHIFT_MS = 10
+
+class AudioTransform:
+ def __init__(self, config):
+ self.sample_rate = config.audio_sample_rate
+ self.num_mel_bins = config.num_mel_bins
+ self.target_length = config.target_length
+ self.audio_mean = config.audio_mean
+ self.audio_std = config.audio_std
+ # mean=-4.2677393
+ # std=4.5689974
+ self.norm = transforms.Normalize(mean=self.audio_mean, std=self.audio_std)
+
+ def __call__(self, audio_data_and_origin_sr):
+ audio_data, origin_sr = audio_data_and_origin_sr
+ if self.sample_rate != origin_sr:
+ # print(audio_data.shape, origin_sr)
+ audio_data = torchaudio.functional.resample(audio_data, orig_freq=origin_sr, new_freq=self.sample_rate)
+ waveform_melspec = self.waveform2melspec(audio_data[0])
+ return self.norm(waveform_melspec)
+
+ def waveform2melspec(self, audio_data):
+ max_len = self.target_length * self.sample_rate // 100
+ if audio_data.shape[-1] > max_len:
+ mel = self.get_mel(audio_data)
+ # split to three parts
+ chunk_frames = self.target_length
+ total_frames = mel.shape[0]
+ ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
+ # print('total_frames-chunk_frames:', total_frames-chunk_frames,
+ # 'len(audio_data):', len(audio_data),
+ # 'chunk_frames:', chunk_frames,
+ # 'total_frames:', total_frames)
+ if len(ranges[1]) == 0: # if the audio is too short, we just use the first chunk
+ ranges[1] = [0]
+ if len(ranges[2]) == 0: # if the audio is too short, we just use the first chunk
+ ranges[2] = [0]
+ # randomly choose index for each part
+ # idx_front = np.random.choice(ranges[0])
+ # idx_middle = np.random.choice(ranges[1])
+ # idx_back = np.random.choice(ranges[2])
+ idx_front = ranges[0][0] # fixed
+ idx_middle = ranges[1][0]
+ idx_back = ranges[2][0]
+ # select mel
+ mel_chunk_front = mel[idx_front:idx_front + chunk_frames, :]
+ mel_chunk_middle = mel[idx_middle:idx_middle + chunk_frames, :]
+ mel_chunk_back = mel[idx_back:idx_back + chunk_frames, :]
+ # stack
+ mel_fusion = torch.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back], dim=0)
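+            # mel_fusion: (3, chunk_frames, num_mel_bins), one window taken from each third of the long clip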
+ elif audio_data.shape[-1] < max_len: # padding if too short
+ n_repeat = int(max_len / len(audio_data))
+ audio_data = audio_data.repeat(n_repeat)
+ audio_data = F.pad(
+ audio_data,
+ (0, max_len - len(audio_data)),
+ mode="constant",
+ value=0,
+ )
+ mel = self.get_mel(audio_data)
+ mel_fusion = torch.stack([mel, mel, mel], dim=0)
+ else: # if equal
+ mel = self.get_mel(audio_data)
+ mel_fusion = torch.stack([mel, mel, mel], dim=0)
+
+ # twice check
+ p = self.target_length - mel_fusion.shape[1]
+
+ # if abs(p) / self.target_length > 0.2:
+ # logging.warning(
+ # "Large gap between audio n_frames(%d) and "
+ # "target_length (%d). Is the audio_target_length "
+ # "setting correct?",
+ # mel_fusion.shape[1],
+ # self.target_length,
+ # )
+
+ # cut and pad
+ if p > 0:
+ m = torch.nn.ZeroPad2d((0, 0, 0, p))
+ mel_fusion = m(mel_fusion)
+ elif p < 0:
+ mel_fusion = mel_fusion[:, 0: self.target_length, :]
+
+ mel_fusion = mel_fusion.transpose(1, 2) # [3, target_length, mel_bins] -> [3, mel_bins, target_length]
+ return mel_fusion
+
+ def get_mel(self, audio_data):
+        # kaldi fbank returns (num_frames, num_mel_bins), one row per 25 ms window
+ audio_data -= audio_data.mean()
+ mel = torchaudio.compliance.kaldi.fbank(
+ audio_data.unsqueeze(0),
+ htk_compat=True,
+ sample_frequency=self.sample_rate,
+ use_energy=False,
+ window_type="hanning",
+ num_mel_bins=self.num_mel_bins,
+ dither=0.0,
+ frame_length=25,
+ frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
+ )
+ return mel # (T, n_mels)
+
+def get_audio_transform(config):
+ config = config.vision_config
+ return AudioTransform(config)
+
+
+def load_and_transform_audio(
+ audio_path,
+ transform,
+):
+ waveform_and_sr = torchaudio_loader(audio_path)
+ audio_outputs = transform(waveform_and_sr)
+
+ return audio_outputs
+
+class LanguageBindAudioProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindAudioTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_audio_transform(config)
+ self.image_processor = load_and_transform_audio
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
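+        # NOTE: despite the CLIP-style naming, `images` is expected to be an audio path (or a list of paths);
+        # each file is loaded with torchaudio and converted to a mel-spectrogram tensor returned as "pixel_values".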
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
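+            # image_features: (num_audios, 3, num_mel_bins, target_length) mel-spectrogram "pixel_values"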
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/audio/tokenization_audio.py b/models/multimodal_encoder/languagebind/audio/tokenization_audio.py
new file mode 100755
index 0000000..6bc40be
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/tokenization_audio.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Audio": "https://huggingface.co/lb203/LanguageBind-Audio/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Audio": "https://huggingface.co/lb203/LanguageBind-Audio/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Audio": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Audio": {},
+}
+
+class LanguageBindAudioTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
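+        pad_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The token used for padding; CLIP has no dedicated pad token, so `<|endoftext|>` is reused.
+
+    Example (illustrative sketch; assumes the tokenizer files referenced above under `lb203/LanguageBind-Audio` are reachable):
+
+    ```python
+    >>> tokenizer = LanguageBindAudioTokenizer.from_pretrained("lb203/LanguageBind-Audio")
+    >>> inputs = tokenizer(["a dog barking"], max_length=77, padding="max_length", truncation=True, return_tensors="pt")
+    ```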
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindAudioTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/depth/configuration_depth.py b/models/multimodal_encoder/languagebind/depth/configuration_depth.py
new file mode 100755
index 0000000..0d3901b
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/configuration_depth.py
@@ -0,0 +1,425 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # the text encoder never uses the temporal-attention branch
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions on top of the standard CLIP vision config
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        max_depth=10,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.max_depth = max_depth
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindDepthConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindDepth"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
diff --git a/models/multimodal_encoder/languagebind/depth/modeling_depth.py b/models/multimodal_encoder/languagebind/depth/modeling_depth.py
new file mode 100755
index 0000000..849eade
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/modeling_depth.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_depth import LanguageBindDepthConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
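+            # video (T > 1): sample one keep-mask per clip and repeat it over the T frames,
+            # so every frame of a clip keeps the same patch positions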
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
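+            # learnable per-frame position embedding, added to the tokens before temporal attention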
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
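+            # tokens arrive flattened as (batch*frames, patches, dim); temporal attention regroups them to
+            # (batch*patches, frames, dim) so each spatial position attends over time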
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindDepthConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindDepth):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Flatten any multi-frame input layout to (B*T, C, H, W) so the 2D patch embedding is applied per frame.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # randomly drop patch tokens during training (https://arxiv.org/abs/2212.00794)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # average the per-frame CLS features over time
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindDepth(CLIPPreTrainedModel):
+ config_class = LanguageBindDepthConfig
+
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
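+        # Wrap the vision encoder with PEFT LoRA adapters: on the temporal attention/MLP when time attention
+        # is enabled, otherwise on the spatial self-attention projections. lora_r == 0 disables LoRA entirely.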
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
+ def resize_pos(self, m, vision_config):
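+        # When the configured input resolution (e.g. num_mel_bins x target_length for spectrogram-like inputs)
+        # yields a patch grid different from the pretrained checkpoint, the position embeddings are bicubically
+        # interpolated to the new grid; the class-token position embedding is carried over unchanged.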
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
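+        # Worked example (assuming a 224x224 checkpoint with patch_size=32, i.e. a 7x7 grid
+        # plus one class token = 50 positions): switching to image_size=448 gives a 14x14 grid,
+        # so the 49 patch embeddings are bicubically interpolated to 196 and the class-token
+        # embedding is concatenated back in front, yielding 197 positions.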
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindDepthConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/depth/processing_depth.py b/models/multimodal_encoder/languagebind/depth/processing_depth.py
new file mode 100755
index 0000000..1019e0c
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/processing_depth.py
@@ -0,0 +1,108 @@
+import cv2
+import torch
+from PIL import Image
+from torch import nn
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def opencv_loader(path):
+ return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype('float32')
+
+
+class DepthNorm(nn.Module):
+ def __init__(
+ self,
+ max_depth=0,
+ min_depth=0.01,
+ ):
+ super().__init__()
+ self.max_depth = max_depth
+ self.min_depth = min_depth
+ self.scale = 1000.0 # nyuv2 abs.depth
+
+ def forward(self, image):
+ # image = np.array(image)
+ depth_img = image / self.scale # (H, W) in meters
+ depth_img = depth_img.clip(min=self.min_depth)
+ if self.max_depth != 0:
+ depth_img = depth_img.clip(max=self.max_depth)
+ depth_img /= self.max_depth # 0-1
+ else:
+ depth_img /= depth_img.max()
+ depth_img = torch.from_numpy(depth_img).unsqueeze(0).repeat(3, 1, 1) # assume image
+ return depth_img.to(torch.get_default_dtype())
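+    # Worked example (assuming max_depth=10): a raw NYUv2 value of 3000 (millimetres) becomes
+    # 3000 / 1000 = 3.0 m, is clipped to [0.01, 10], divided by 10 to give 0.3, and finally
+    # repeated to a 3-channel tensor so the RGB CLIP stem can be reused unchanged.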
+
+def get_depth_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ DepthNorm(max_depth=config.max_depth),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD), # assume image
+ # transforms.Normalize((0.5, ), (0.5, )) # 0-1 to norm distribution
+ # transforms.Normalize((0.0418, ), (0.0295, )) # sun rgb-d imagebind
+ # transforms.Normalize((0.02, ), (0.00295, )) # nyuv2
+ ]
+ )
+ return transform
+
+def load_and_transform_depth(depth_path, transform):
+ depth = opencv_loader(depth_path)
+ depth_outputs = transform(depth)
+ return depth_outputs
+
+class LanguageBindDepthProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindDepthTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_depth_transform(config)
+ self.image_processor = load_and_transform_depth
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
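+    # Illustrative usage (file name and prompt are hypothetical):
+    #   processor = LanguageBindDepthProcessor(config, tokenizer)
+    #   batch = processor(images="room.png", text=["a depth map of a room"], return_tensors="pt")
+    #   batch["input_ids"].shape     # (1, 77)  -- padded to context_length
+    #   batch["pixel_values"].shape  # (1, 3, 224, 224)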
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/depth/tokenization_depth.py b/models/multimodal_encoder/languagebind/depth/tokenization_depth.py
new file mode 100755
index 0000000..eda9905
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/tokenization_depth.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Depth": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "lb203/LanguageBind-Depth": {},
+}
+
+class LanguageBindDepthTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindDepthTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/image/configuration_image.py b/models/multimodal_encoder/languagebind/image/configuration_image.py
new file mode 100755
index 0000000..c1c7b0f
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/configuration_image.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # LanguageBind addition: the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions (not present in the stock CLIPVisionConfig):
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific fields
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
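+    # Illustrative (values are arbitrary): the extra keyword arguments above are what this class
+    # adds on top of the stock transformers CLIPVisionConfig.
+    #   cfg = CLIPVisionConfig(add_time_attn=True, num_frames=8, lora_r=16, lora_alpha=16)
+    #   (cfg.num_frames, cfg.lora_r)  # -> (8, 16)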
+
+
+class LanguageBindImageConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindImage"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
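+    # Illustrative round trip (hypothetical values):
+    #   cfg = LanguageBindImageConfig.from_text_vision_configs(
+    #       CLIPTextConfig(), CLIPVisionConfig(num_frames=8))
+    #   d = cfg.to_dict()
+    #   d["model_type"]                   # -> "LanguageBindImage"
+    #   d["vision_config"]["num_frames"]  # -> 8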
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/image/modeling_image.py b/models/multimodal_encoder/languagebind/image/modeling_image.py
new file mode 100755
index 0000000..e95ac47
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/modeling_image.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_image import LanguageBindImageConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
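+    # Illustrative shapes (assuming prob=0.5 and a ViT sequence of 1 CLS + 196 patch tokens):
+    # during training each sample keeps the CLS token plus max(1, int(196 * 0.5)) = 98 randomly
+    # chosen patches, so (B*T, 197, D) becomes (B*T, 99, D); at eval time, or with prob=0,
+    # the input is returned unchanged.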
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
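+    # Illustrative example of the temporal-attention reshape above (assuming B=2 videos of
+    # T=8 frames with N=50 tokens of width D): spatial attention sees (B*T, N, D) = (16, 50, D),
+    # while the temporal branch rearranges to (B*N, T, D) = (100, 8, D) so that each spatial
+    # position attends across its 8 frames before being rearranged back.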
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindImageConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindImage):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
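+# Illustrative output (tgt_len=3, no cached keys): the mask is 0 on and below the diagonal and
+# the dtype's most negative value above it, broadcast to shape (bsz, 1, 3, 3):
+#   [[0, min, min],
+#    [0,   0, min],
+#    [0,   0,   0]]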
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # LanguageBind modification: flatten video (and paired-video) inputs so that every
+        # frame is encoded as an independent image of shape (B*T, C, H, W).
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # LanguageBind modification: optional patch dropout
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+ # Average the pooled features over the T frames of each sample.
+ pooled_output = pooled_output.reshape(B, T, -1).mean(1)
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindImage(CLIPPreTrainedModel):
+ config_class = LanguageBindImageConfig
+
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ # self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
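+ # Illustrative sketch (not part of the original code; names here are only for
+ # illustration): with the default vision config (lora_r=2, lora_alpha=16,
+ # add_time_attn=False), the call above behaves roughly like
+ #
+ #     peft_cfg = LoraConfig(r=2, lora_alpha=16, lora_dropout=0.0, bias="none",
+ #                           target_modules=["k_proj", "v_proj", "q_proj", "out_proj"])
+ #     self.vision_model.encoder = get_peft_model(self.vision_model.encoder, peft_cfg)
+ #
+ # i.e. LoRA adapters are injected only into the encoder's attention projections, and by
+ # default PEFT marks only those adapter weights as trainable.
+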
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins != 0 and vision_config.target_length != 0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
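+
+ # Worked example (illustration only): resizing a 224x224 checkpoint with patch_size=32
+ # (7x7 grid, 1 + 49 positions) to 448x448 inputs (14x14 grid, 1 + 196 positions) keeps the
+ # class-token embedding as-is and bicubically interpolates the 49 patch embeddings up to
+ # 196 before loading them into the new nn.Embedding.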
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindImageConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/image/processing_image.py b/models/multimodal_encoder/languagebind/image/processing_image.py
new file mode 100755
index 0000000..1aafc79
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/processing_image.py
@@ -0,0 +1,82 @@
+import torch
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_image_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD) # assume image
+ ]
+ )
+ return transform
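+
+ # Rough output sketch (assumption: a standard RGB PIL image as input): the composed
+ # transform returns a float tensor of shape (3, 224, 224), bicubically resized,
+ # center-cropped and normalized with the OpenAI CLIP statistics above, e.g.
+ #
+ #     transform = get_image_transform(config)        # `config` as used by the processor below
+ #     tensor = transform(Image.open("example.jpg"))  # hypothetical path
+ #     tensor.shape                                   # torch.Size([3, 224, 224])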
+
+
+def load_and_transform_image(image_path, transform):
+ image = Image.open(image_path).convert('RGB') if isinstance(image_path, str) else image_path
+ image_outputs = transform(image)
+ return image_outputs
+
+class LanguageBindImageProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = "LanguageBindImageTokenizer"
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_image_transform(config)
+ self.image_processor = load_and_transform_image
+ self.tokenizer = tokenizer
+ self.image_mean = OPENAI_DATASET_MEAN
+ self.crop_size = {'height': 224, 'width': 224}
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def preprocess(self, images, return_tensors):
+ return self.__call__(images=images, return_tensors=return_tensors)
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
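+
+ # Minimal usage sketch (hypothetical objects/paths, for orientation only):
+ #
+ #     processor = LanguageBindImageProcessor(model.config, tokenizer=tokenizer)
+ #     batch = processor(images=Image.open("cat.jpg"), text=["a photo of a cat"],
+ #                       return_tensors="pt")
+ #     # batch["pixel_values"]: (1, 3, 224, 224); batch["input_ids"]/["attention_mask"]: (1, 77)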
diff --git a/models/multimodal_encoder/languagebind/image/tokenization_image.py b/models/multimodal_encoder/languagebind/image/tokenization_image.py
new file mode 100755
index 0000000..593423d
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/tokenization_image.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Image": "https://huggingface.co/lb203/LanguageBind-Image/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Image": "https://huggingface.co/lb203/LanguageBind-Image/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Image": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Image": {},
+}
+
+class LanguageBindImageTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindImageTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py b/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py
new file mode 100755
index 0000000..fd6cedd
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.add_time_attn = False  # temporal attention is never used in the text tower
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+ # LanguageBind-specific options (not part of the upstream CLIPVisionConfig)
+ add_time_attn=False,
+ num_frames=1,
+ force_patch_dropout=0.0,
+ lora_r=2,
+ lora_alpha=16,
+ lora_dropout=0.0,
+ num_mel_bins=0.0,
+ target_length=0.0,
+ video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+ # LanguageBind-specific attributes
+ self.add_time_attn = add_time_attn
+ self.num_frames = num_frames
+ self.force_patch_dropout = force_patch_dropout
+ self.lora_r = lora_r
+ self.lora_alpha = lora_alpha
+ self.lora_dropout = lora_dropout
+ self.num_mel_bins = num_mel_bins
+ self.target_length = target_length
+ self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindThermalConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+ Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+ The initial value of the *logit_scale* parameter. The default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindThermal"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `text_config_dict`/`vision_config_dict` exist, use them for backward compatibility.
+ # They are popped out before calling `super().__init__` to avoid them being saved (which
+ # causes a lot of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but are different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+ f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but are different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+ f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+ logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py b/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py
new file mode 100755
index 0000000..f0323b3
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_thermal import LanguageBindThermalConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
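+
+ # Worked example (assumption, for illustration): with prob=0.5 and an input of shape
+ # (B*T, 1 + 196, d), the CLS token is kept together with a random 98 of the 196 patch
+ # tokens, giving an output of shape (B*T, 1 + 98, d); when T > 1 the same patch indices
+ # are reused for all T frames of a sample.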
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
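+
+ # Shape sketch (illustrative, assuming add_time_attn=True): hidden_states arrive as
+ # (B*T, N, D); the temporal branch rearranges them to (B*N, T, D) so attention runs across
+ # the T frames of each spatial token, then rearranges back to (B*T, N, D) before the usual
+ # spatial self-attention and MLP blocks.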
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindThermalConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindThermal):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
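+
+ # Worked example (illustration only): for input_ids_shape == (1, 3), no past key values and
+ # float32, the returned mask has shape (1, 1, 3, 3) and looks like
+ #
+ #     [[0, m, m],
+ #      [0, 0, m],
+ #      [0, 0, 0]]
+ #
+ # where m = torch.finfo(torch.float32).min, i.e. each position may only attend to itself
+ # and to earlier positions.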
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
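+    # Note on the pooling above (illustrative, token ids are hypothetical): the EOT
+    # token has the largest id in CLIP's vocabulary, so `argmax(dim=-1)` locates it
+    # in each sequence; e.g. for input_ids [[49406, 320, 1125, 49407]] the argmax is
+    # position 3 and the pooled output is last_hidden_state[:, 3, :].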
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # (sic) misspelling kept to match the pretrained CLIP checkpoint keys
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Modified from the original CLIP vision tower: flatten any leading
+        # batch/pair/time dimensions so every frame goes through the shared
+        # 2-D patch embedding as an ordinary image.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B*T, channel_new, h_new, w_new)
+
+        elif len(pixel_values.shape) == 5:
+            # video input: (b, c, t, h, w) -> (b*t, c, h, w)
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            # plain image input: (b, c, h, w), treated as a single frame (T = 1)
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # modified: optional patch dropout (no-op at eval time or when force_patch_dropout == 0)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # modified: average per-frame pooled features over time
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
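+    # Shape sketch for the forward pass above (illustrative, sizes hypothetical):
+    # a video batch of shape (2, 3, 8, 224, 224) is flattened to 16 frames, encoded
+    # frame by frame, and the per-frame pooled features are averaged over the 8
+    # frames, so pooler_output comes back with shape (2, hidden_size) while
+    # last_hidden_state stays per-frame with shape (16, num_patches + 1, hidden_size).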
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindThermal(CLIPPreTrainedModel):
+ config_class = LanguageBindThermalConfig
+
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
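+    # Descriptive note: when lora_r > 0, get_peft_model wraps only the vision
+    # encoder, making the low-rank adapters on the listed projections the sole
+    # trainable weights there (the spatial q/k/v/out projections by default, or the
+    # temporal attention/MLP layers when add_time_attn is enabled); lora_r == 0
+    # leaves the encoder untouched.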
+
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
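+    # Worked example for the resize above (illustrative numbers): a checkpoint
+    # trained at image_size 224 with patch_size 32 stores a 7x7 grid plus one class
+    # token (50 positions); reconfiguring to a 14x14 grid keeps the class-token row
+    # and bicubically interpolates the 7x7 grid to 14x14, giving 197 positions.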
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindThermalConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/thermal/processing_thermal.py b/models/multimodal_encoder/languagebind/thermal/processing_thermal.py
new file mode 100755
index 0000000..36ed1f0
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/processing_thermal.py
@@ -0,0 +1,77 @@
+import torch
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_thermal_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD) # assume image
+ ]
+ )
+ return transform
+
+
+def load_and_transform_thermal(thermal_path, transform):
+ thermal = Image.open(thermal_path)
+ thermal_outputs = transform(thermal)
+ return thermal_outputs
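+# Minimal usage sketch (the path and config are placeholders, not part of this
+# repo): for an RGB thermal image the transform resizes the short side to 224,
+# center-crops, and normalizes with the OpenAI CLIP statistics.
+# >>> transform = get_thermal_transform(config)
+# >>> pixels = load_and_transform_thermal("example_thermal.jpg", transform)
+# >>> pixels.shape
+# torch.Size([3, 224, 224])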
+
+class LanguageBindThermalProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindThermalTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_thermal_transform(config)
+ self.image_processor = load_and_transform_thermal
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
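+# Minimal usage sketch (identifiers below are placeholders): given a loaded
+# LanguageBindThermalConfig and tokenizer, the processor returns token ids plus a
+# (num_images, 3, 224, 224) pixel_values tensor when both modalities are passed.
+# >>> processor = LanguageBindThermalProcessor(config, tokenizer=tokenizer)
+# >>> batch = processor(images=["example_thermal.jpg"], text=["a person walking"],
+# ...                   return_tensors="pt")
+# >>> sorted(batch.keys()), batch["pixel_values"].shape
+# (['attention_mask', 'input_ids', 'pixel_values'], torch.Size([1, 3, 224, 224]))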
diff --git a/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py b/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py
new file mode 100755
index 0000000..a4ebb56
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Thermal": "https://huggingface.co/lb203/LanguageBind-Thermal/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Thermal": "https://huggingface.co/lb203/LanguageBind-Thermal/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Thermal": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Thermal": {},
+}
+
+class LanguageBindThermalTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindThermalTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py b/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py
new file mode 100755
index 0000000..eaddfff
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py
@@ -0,0 +1,56 @@
+import torch
+import cv2
+import decord
+from decord import VideoReader, cpu
+decord.bridge.set_bridge('torch')
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from pytorchvideo.data.encoded_video import EncodedVideo
+from torchvision.transforms import Compose, Lambda, ToTensor
+from torchvision.transforms._transforms_video import NormalizeVideo, RandomCropVideo, RandomHorizontalFlipVideo, CenterCropVideo
+from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
+
+import os
+import glob
+from tqdm import tqdm
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def get_video_transform():
+ # import pdb; pdb.set_trace()
+
+
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+
+ return transform
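+# Illustrative shape check (sizes are hypothetical): the transform expects a
+# (C, T, H, W) clip and returns the same layout at 224x224 resolution.
+# >>> clip = torch.randint(0, 256, (3, 8, 360, 640), dtype=torch.uint8)
+# >>> get_video_transform()(clip).shape
+# torch.Size([3, 8, 224, 224])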
+
+if __name__ == '__main__':
+ directory = '/comp_robot/lushunlin/MotionGPT/video_datasets/videochatgpt/videochatgpt_tune'
+ mp4_files = glob.glob(os.path.join(directory, '*.mp4'))
+ # import pdb; pdb.set_trace()
+ transform = get_video_transform()
+ for video_path in tqdm(mp4_files):
+ try:
+ decord.bridge.set_bridge('torch')
+ decord_vr = VideoReader(video_path, ctx=cpu(0))
+ duration = len(decord_vr)
+ frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
+ video_data = decord_vr.get_batch(frame_id_list)
+ video_data = video_data.permute(3, 0, 1, 2) # (T, H, W, C) -> (C, T, H, W)
+ video_outputs = transform(video_data)
+        except Exception:
+ with open('/comp_robot/lushunlin/MotionGPT/records/decord_error.txt', 'a') as f:
+ f.write(video_path+'\n')
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/configuration_video.py b/models/multimodal_encoder/languagebind/video/configuration_video.py
new file mode 100755
index 0000000..4b108ec
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/configuration_video.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # LanguageBind addition: the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions (not in the original CLIP vision config):
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific additions
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindVideoConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindVideo"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/video/modeling_video.py b/models/multimodal_encoder/languagebind/video/modeling_video.py
new file mode 100755
index 0000000..cb5c621
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/modeling_video.py
@@ -0,0 +1,1033 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_video import LanguageBindVideoConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
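+    # Illustrative behaviour (hypothetical sizes): with prob=0.5 and a sequence of
+    # 1 CLS token plus 256 patches, training keeps the CLS token plus
+    # max(1, int(256 * 0.5)) = 128 randomly chosen patches per sample; for video
+    # inputs (T > 1) the same patch indices are reused across the T frames of a
+    # clip, and at eval time (or when prob == 0) the input passes through unchanged.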
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ # self.temporal_mlp = CLIPMLP(config)
+ # self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ `(config.encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # residual = hidden_states
+ # hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ # hidden_states = self.temporal_layer_norm2(hidden_states)
+ # hidden_states = self.temporal_mlp(hidden_states)
+ # hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
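+    # Shape sketch for the temporal branch above (illustrative): tokens arrive as
+    # (b*t, n, d); the temporal attention rearranges them to (b*n, t, d), adds the
+    # learned temporal_embedding, and lets each spatial location attend across its
+    # t frames before rearranging back to (b*t, n, d) for the usual spatial
+    # self-attention and MLP over the n token axis.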
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindVideoConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindVideo):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+    Make the causal (uni-directional) attention mask used by CLIP's text encoder.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
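+# Example sketch (illustration only, not part of the upstream CLIP code):
+# >>> mask = _make_causal_mask(torch.Size([2, 3]), torch.float32, torch.device("cpu"))
+# >>> mask.shape
+# torch.Size([2, 1, 3, 3])
+# Entries at (i, j) with j <= i are 0 and entries with j > i are the most negative float32
+# value, so after softmax each position attends only to itself and earlier positions.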
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
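+# Pooling note (sketch, token ids are illustrative): the EOT token has the largest id in CLIP's
+# vocabulary, so `input_ids.argmax(dim=-1)` above locates it per sequence, e.g. for
+# input_ids = [[49406, 320, 1125, 49407, 49407]] the pooled feature is taken at index 3, the
+# first <|endoftext|> position, even when padding reuses the same token.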
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
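+        # NOTE: the 'pre_layrnorm' spelling above is kept as-is so that parameter names match the
+        # upstream CLIP implementation and its pretrained checkpoints.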
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Flatten the temporal (and pairing) dimensions so every frame is encoded as an image.
+        if len(pixel_values.shape) == 7:
+            # (b, pair, T, bs, C, H, W) -> (b * pair * bs * T, C, H, W)
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            # (B, C, T, H, W) -> (B * T, C, H, W)
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            # Plain image batch: (B, C, H, W)
+            B, _, _, _ = pixel_values.shape
+            T = 1
+
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.patch_dropout(hidden_states, B, T)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+        last_hidden_state = encoder_outputs[0]
+        # Pool the CLS token of every frame, then average the pooled features over time.
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)
+
+        # Restore the (batch, time, tokens, channels) layout of the per-layer hidden states.
+        if return_dict and encoder_outputs.hidden_states is not None:
+            encoder_outputs.hidden_states = [rearrange(i, '(b t) n c -> b t n c', b=B) for i in encoder_outputs.hidden_states]
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
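+# Shape-flow sketch for the video path above (sizes are illustrative): pixel_values of shape
+# (B, C, T, H, W) = (2, 3, 8, 224, 224) are rearranged to (B*T, C, H, W) = (16, 3, 224, 224),
+# every frame is encoded independently, and the per-frame CLS features are reshaped back to
+# (B, T, hidden) and averaged over T, yielding one pooled embedding per video.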
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindVideo(CLIPPreTrainedModel):
+ config_class = LanguageBindVideoConfig
+
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+        # NOTE: LoRA conversion and positional-embedding resizing are implemented below but are
+        # not applied here; call self.convert_to_lora() and/or
+        # self.resize_pos(self.vision_model.embeddings, vision_config) explicitly if needed.
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
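+    # Usage sketch (illustrative; not called by default, see the note at the end of __init__):
+    # >>> model = LanguageBindVideo(config)
+    # >>> model.convert_to_lora()   # no-op when vision_config.lora_r == 0
+    # After the call, get_peft_model has wrapped the vision encoder so that only the injected
+    # low-rank adapters on the listed projection/MLP modules receive gradients.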
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
+
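+    # Worked example for the resize above (values are illustrative, not from a specific
+    # checkpoint): with image_size = [128, 1024] and patch_size = 16 the new grid is
+    # [128 // 16, 1024 // 16] = [8, 64], so new_seq_len = 8 * 64 + 1 = 513; the old square
+    # position-embedding grid is interpolated bicubically to 8 x 64 while the class-token
+    # embedding is carried over unchanged.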
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindVideoConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/processing_video.py b/models/multimodal_encoder/languagebind/video/processing_video.py
new file mode 100755
index 0000000..92682ef
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/processing_video.py
@@ -0,0 +1,213 @@
+
+import torch
+import cv2
+import decord
+from decord import VideoReader, cpu
+decord.bridge.set_bridge('torch')
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from pytorchvideo.data.encoded_video import EncodedVideo
+from torchvision.transforms import Compose, Lambda, ToTensor
+from torchvision.transforms._transforms_video import NormalizeVideo, RandomCropVideo, RandomHorizontalFlipVideo, CenterCropVideo
+from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
+
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_video_transform(config):
+ config = config.vision_config
+ if config.video_decode_backend == 'pytorchvideo':
+ transform = ApplyTransformToKey(
+ key="video",
+ transform=Compose(
+ [
+ UniformTemporalSubsample(config.num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ ),
+ )
+
+ elif config.video_decode_backend == 'decord':
+
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+
+ elif config.video_decode_backend == 'opencv':
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+ else:
+        raise NameError("video_decode_backend should be one of ('pytorchvideo', 'decord', 'opencv')")
+ return transform
+
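+# Usage sketch (assumes a config whose vision_config.video_decode_backend == 'decord'):
+# >>> transform = get_video_transform(config)
+# >>> frames = torch.randint(0, 256, (3, 8, 360, 640)).float()   # (C, T, H, W), uint8 range
+# >>> clip = transform(frames)                                    # -> (3, 8, 224, 224), normalized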
+
+def load_and_transform_video(
+ video_path,
+ transform,
+ video_decode_backend='opencv',
+ clip_start_sec=0.0,
+ clip_end_sec=None,
+ num_frames=8,
+):
+ if video_decode_backend == 'pytorchvideo':
+ # decord pyav
+ video = EncodedVideo.from_path(video_path, decoder="decord", decode_audio=False)
+ duration = video.duration
+ start_sec = clip_start_sec # secs
+ end_sec = clip_end_sec if clip_end_sec is not None else duration # secs
+ video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+ video_outputs = transform(video_data)
+
+ elif video_decode_backend == 'decord':
+        decord.bridge.set_bridge('torch')
+        decord_vr = VideoReader(video_path, ctx=cpu(0))
+        start_idx = 0
+        end_idx = len(decord_vr) - 1
+
+        # Optionally restrict sampling to the [clip_start_sec, clip_end_sec] window.
+        if clip_end_sec is not None:
+            fps = float(decord_vr.get_avg_fps())
+            start_idx = max(start_idx, round(clip_start_sec * fps))
+            end_idx = min(round(clip_end_sec * fps), end_idx)
+
+        # Sample `num_frames` evenly spaced frame indices and decode them in a single batch.
+        frame_id_list = np.linspace(start_idx, end_idx, num_frames, dtype=int)
+        video_data = decord_vr.get_batch(frame_id_list)
+        video_data = video_data.permute(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
+        video_outputs = transform(video_data)
+
+ elif video_decode_backend == 'opencv':
+ cv2_vr = cv2.VideoCapture(video_path)
+ duration = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
+ frame_id_list = np.linspace(0, duration-5, num_frames, dtype=int)
+
+ video_data = []
+ for frame_idx in frame_id_list:
+            cv2_vr.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)  # seek to the sampled frame
+ ret, frame = cv2_vr.read()
+ if not ret:
+ raise ValueError(f'video error at {video_path}')
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ video_data.append(torch.from_numpy(frame).permute(2, 0, 1))
+ cv2_vr.release()
+ video_data = torch.stack(video_data, dim=1)
+ video_outputs = transform(video_data)
+ else:
+        raise NameError("video_decode_backend should be one of ('pytorchvideo', 'decord', 'opencv')")
+ return video_outputs
+
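+# Usage sketch (the path and frame count below are placeholders):
+# >>> transform = get_video_transform(config)
+# >>> clip = load_and_transform_video("example.mp4", transform,
+# ...                                 video_decode_backend='decord', num_frames=8)
+# With the decord backend this samples 8 evenly spaced frames, permutes them to (C, T, H, W)
+# and applies the normalization/crop pipeline defined above.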
+class LanguageBindVideoProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindVideoTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_video_transform(config)
+ self.image_processor = load_and_transform_video
+ self.tokenizer = tokenizer
+
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, bound=None, **kwargs):
+ if bound is not None:
+ start = bound[0]
+ end = bound[1]
+ else:
+ start = 0.0
+ end = None
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+        if text is not None:
+            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+                                      truncation=True, return_tensors=return_tensors, **kwargs)
+
+        if images is not None:
+            images = make_list_of_images(images)
+            image_features = []
+            for image in images:
+                image_features.append(
+                    self.image_processor(image, self.transform,
+                                         video_decode_backend=self.config.vision_config.video_decode_backend,
+                                         clip_start_sec=start, clip_end_sec=end,
+                                         num_frames=self.config.vision_config.num_frames))
+            image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
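+    # Usage sketch (identifiers are illustrative): with a loaded config and tokenizer,
+    # >>> processor = LanguageBindVideoProcessor(config, tokenizer)
+    # >>> batch = processor(images=["clip.mp4"], text=["a person waves"], return_tensors="pt")
+    # >>> batch["pixel_values"].shape   # (1, 3, num_frames, 224, 224)
+    # >>> batch["input_ids"].shape      # (1, 77)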
+ def preprocess(self, images, return_tensors):
+ return self.__call__(images=images, return_tensors=return_tensors)
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/video/tokenization_video.py b/models/multimodal_encoder/languagebind/video/tokenization_video.py
new file mode 100755
index 0000000..2864429
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/tokenization_video.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Video": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Video": {},
+}
+
+class LanguageBindVideoTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+        super(LanguageBindVideoTokenizer, self).__init__(
+            vocab_file,
+            merges_file,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,  # hack to enable padding
+            **kwargs,
+        )
\ No newline at end of file
diff --git a/models/multimodal_encoder/mae_encoder.py b/models/multimodal_encoder/mae_encoder.py
new file mode 100755
index 0000000..377883d
--- /dev/null
+++ b/models/multimodal_encoder/mae_encoder.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+
+from transformers import ViTMAEForPreTraining, AutoConfig, AutoImageProcessor
+
+
+class MAEVisionTower(nn.Module):
+ def __init__(self, vision_tower, args, cache_dir='./cache_dir', delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+ self.cache_dir = cache_dir
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+
+ def load_model(self):
+ self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+ vision_tower = ViTMAEForPreTraining.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+ self.vision_tower = vision_tower.vit
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ # print(image_features.shape)
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
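+    # Example (illustrative): with the common ViT-MAE base setup of image_size = 224 and
+    # patch_size = 16, num_patches evaluates to (224 // 16) ** 2 = 196.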
diff --git a/models/multimodal_projector/builder.py b/models/multimodal_projector/builder.py
new file mode 100755
index 0000000..8cc8e9d
--- /dev/null
+++ b/models/multimodal_projector/builder.py
@@ -0,0 +1,257 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import re
+
+from transformers import PretrainedConfig, Blip2PreTrainedModel, Blip2Config, Blip2QFormerModel
+
+
+class IdentityMap(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x, *args, **kwargs):
+ return x
+
+ @property
+ def config(self):
+ return {"mm_projector_type": 'identity'}
+
+
+class SimpleResBlock(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.pre_norm = nn.LayerNorm(channels)
+
+ self.proj = nn.Sequential(
+ nn.Linear(channels, channels),
+ nn.GELU(),
+ nn.Linear(channels, channels)
+ )
+ def forward(self, x):
+ x = self.pre_norm(x)
+ return x + self.proj(x)
+
+
+class Blip2Model(Blip2PreTrainedModel):
+ def __init__(self, config: Blip2Config):
+ super().__init__(config)
+
+ self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+ self.qformer = Blip2QFormerModel(config.qformer_config)
+
+ # self.proj = nn.Linear(config.mm_hidden_size, config.hidden_size)
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size), nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
+ self.proj = nn.Sequential(*modules)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ r"""
+ Returns:
+ vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
+ The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
+ contains the image features, the pooled image features and the hidden states if
+ `output_hidden_states=True`.
+ Examples:
+ ```python
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import Blip2Processor, Blip2Model
+
+ >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
+ >>> model.to(device) # doctest: +IGNORE_RESULT
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Unlike the stock BLIP-2 model, this projector receives pre-computed vision features as
+        # `pixel_values`, so they are used directly as the cross-attention inputs.
+        image_embeds = pixel_values
+
+        # Forward the query tokens through the Q-Former, using the image embeddings for cross-attention.
+ image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ ).last_hidden_state
+        # Project the Q-Former queries into the language-model embedding space.
+        query_outputs = self.proj(query_outputs)
+ return query_outputs
+
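+# Shape-flow sketch for Blip2Model.forward (sizes are illustrative): vision features of shape
+# (batch, num_patches, mm_hidden_size) enter as `pixel_values`, the num_query_tokens learnable
+# queries cross-attend to them inside the Q-Former, and the resulting
+# (batch, num_query_tokens, mm_hidden_size) output is projected by the MLP head to
+# (batch, num_query_tokens, hidden_size) tokens for the language model.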
+
+def qformer_config_template(config, projector_type):
+ pattern = r"qformer(\d+)_(\d+)"
+
+    match = re.search(pattern, projector_type)
+    if match is None:
+        raise ValueError(f"Cannot parse projector_type '{projector_type}'; expected a name like 'qformer2_64'")
+    num_hidden_layers = int(match.group(1))
+    num_query_tokens = int(match.group(2))
+
+ qformer_config = type('Blip2Config', (PretrainedConfig,), {
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "model_type": "blip-2",
+ "num_query_tokens": num_query_tokens,
+ "hidden_size": config.hidden_size,
+ "mm_hidden_size": config.mm_hidden_size,
+ "qformer_config": type('qformer_config', (PretrainedConfig,), {
+ "_name_or_path": "",
+ "add_cross_attention": False,
+ "architectures": None,
+ "attention_probs_dropout_prob": 0.0,
+ "bad_words_ids": None,
+ "begin_suppress_tokens": None,
+ "bos_token_id": None,
+ "chunk_size_feed_forward": 0,
+ "classifier_dropout": None,
+ "cross_attention_frequency": 1,
+ "cross_attention_hidden_size": None,
+ "decoder_start_token_id": None,
+ "diversity_penalty": 0.0,
+ "do_sample": False,
+ "early_stopping": False,
+ "encoder_hidden_size": config.mm_hidden_size,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": None,
+ "exponential_decay_length_penalty": None,
+ "finetuning_task": None,
+ "forced_bos_token_id": None,
+ "forced_eos_token_id": None,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": config.mm_hidden_size,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": config.mm_hidden_size * 4,
+ "is_decoder": False,
+ "is_encoder_decoder": False,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_eps": 1e-12,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 512,
+ "min_length": 0,
+ "model_type": "blip_2_qformer",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 32,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": num_hidden_layers,
+ "num_return_sequences": 1,
+ "output_attentions": False,
+ "output_hidden_states": False,
+ "output_scores": False,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "prefix": None,
+ "problem_type": None,
+ "pruned_heads": {},
+ "remove_invalid_values": False,
+ "repetition_penalty": 1.0,
+ "return_dict": True,
+ "return_dict_in_generate": False,
+ "sep_token_id": None,
+ "suppress_tokens": None,
+ "task_specific_params": None,
+ "temperature": 1.0,
+ "tf_legacy_loss": False,
+ "tie_encoder_decoder": False,
+ "tie_word_embeddings": True,
+ "tokenizer_class": None,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": None,
+ "torchscript": False,
+ "transformers_version": "4.27.0.dev0",
+ "typical_p": 1.0,
+ "use_bfloat16": False,
+ "vocab_size": 30522
+ })()
+ })()
+ return qformer_config
+
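+# Example (illustrative): a projector_type of 'qformer2_64' is parsed by the regex above into
+# num_hidden_layers = 2 and num_query_tokens = 64, so build_vision_projector below returns a
+# Blip2Model whose Q-Former has 2 layers and 64 learnable query tokens.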
+def build_vision_projector(config, delay_load=False, **kwargs):
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+ if projector_type == 'linear':
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+ elif projector_type == 'identity':
+ return IdentityMap()
+
+ elif projector_type.startswith('qformer'): # qformer2_64
+ qformer_config = qformer_config_template(config, projector_type)
+ return Blip2Model(qformer_config)
+ else:
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+ if mlp_gelu_match:
+ mlp_depth = int(mlp_gelu_match.group(1))
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+ for _ in range(1, mlp_depth):
+ modules.append(nn.GELU())
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+ return nn.Sequential(*modules)
+
+ raise ValueError(f'Unknown projector type: {projector_type}')
\ No newline at end of file
diff --git a/models/quantize_cnn.py b/models/quantize_cnn.py
new file mode 100755
index 0000000..8cd3ecd
--- /dev/null
+++ b/models/quantize_cnn.py
@@ -0,0 +1,413 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class QuantizeEMAReset(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.mu = args.mu
+ self.reset_codebook()
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_sum = None
+ self.code_count = None
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+ else :
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = out[:self.nb_code]
+ self.code_sum = self.codebook.clone()
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+ def compute_perplexity(self, code_idx) :
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ @torch.no_grad()
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ out = self._tile(x)
+ code_rand = out[:self.nb_code]
+
+ # Update centres
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
+
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
+
+ self.codebook = usage * code_update + (1 - usage) * code_rand
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+
+ return perplexity
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+
+ # Preprocess
+ x = self.preprocess(x)
+
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+ else :
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
+
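+# Usage sketch (assumes `args` exposes an EMA decay `mu`, e.g. 0.99, and a CUDA device, since
+# the codebook buffer above is created with .cuda()):
+# >>> quantizer = QuantizeEMAReset(nb_code=512, code_dim=512, args=args)
+# >>> x_d, commit_loss, perplexity = quantizer(torch.randn(4, 512, 64).cuda())   # (N, C, T)
+# x_d keeps the input's (N, C, T) shape; gradients reach the encoder through the
+# straight-through estimator `x + (x_d - x).detach()`, while the codebook itself is updated by
+# the EMA statistics rather than by backpropagation.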
+
+
+class Quantizer(nn.Module):
+ def __init__(self, n_e, e_dim, beta):
+ super(Quantizer, self).__init__()
+
+ self.e_dim = e_dim
+ self.n_e = n_e
+ self.beta = beta
+
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+ def forward(self, z):
+
+ N, width, T = z.shape
+ z = self.preprocess(z)
+ assert z.shape[-1] == self.e_dim
+ z_flattened = z.contiguous().view(-1, self.e_dim)
+
+ # B x V
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
+ torch.matmul(z_flattened, self.embedding.weight.t())
+ # B x 1
+ min_encoding_indices = torch.argmin(d, dim=1)
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
+
+ # compute loss for embedding
+ loss = torch.mean((z_q - z.detach())**2) + self.beta * \
+ torch.mean((z_q.detach() - z)**2)
+
+ # preserve gradients
+ z_q = z + (z_q - z).detach()
+ z_q = z_q.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype)
+ e_mean = torch.mean(min_encodings, dim=0)
+ perplexity = torch.exp(-torch.sum(e_mean*torch.log(e_mean + 1e-10)))
+ return z_q, loss, perplexity
+
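+    # Loss note (sketch): the first term above is the codebook loss ||z_q - sg(z)||^2 and the
+    # second, weighted by beta, is the commitment loss ||sg(z_q) - z||^2, matching the standard
+    # VQ-VAE objective; beta is typically a small constant such as 0.25.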
+ def quantize(self, z):
+
+ assert z.shape[-1] == self.e_dim
+
+ # B x V
+ d = torch.sum(z ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
+ torch.matmul(z, self.embedding.weight.t())
+ # B x 1
+ min_encoding_indices = torch.argmin(d, dim=1)
+ return min_encoding_indices
+
+ def dequantize(self, indices):
+
+ index_flattened = indices.view(-1)
+ z_q = self.embedding(index_flattened)
+ z_q = z_q.view(indices.shape + (self.e_dim, )).contiguous()
+ return z_q
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+
+
+class QuantizeReset(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.reset_codebook()
+ self.codebook = nn.Parameter(torch.randn(nb_code, code_dim))
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_count = None
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+ else :
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = nn.Parameter(out[:self.nb_code])
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+ def compute_perplexity(self, code_idx) :
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ out = self._tile(x)
+ code_rand = out[:self.nb_code]
+
+ # Update centres
+ self.code_count = code_count # nb_code
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
+
+ self.codebook.data = usage * self.codebook.data + (1 - usage) * code_rand
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+
+ return perplexity
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+ # Preprocess
+ x = self.preprocess(x)
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+        else:
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
+
+
+class QuantizeEMA(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.mu = 0.99
+ self.reset_codebook()
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_sum = None
+ self.code_count = None
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+        else:
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = out[:self.nb_code]
+ self.code_sum = self.codebook.clone()
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+    def compute_perplexity(self, code_idx):
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ @torch.no_grad()
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ # Update centres
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
+
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
+
+ self.codebook = code_update
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+ return perplexity
+
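+    # Editor's note on update_codebook() above: the codebook is maintained with the
+    # standard exponential-moving-average rule (mu = 0.99),
+    #     code_sum_k   <- mu * code_sum_k   + (1 - mu) * sum of encoder outputs assigned to k
+    #     code_count_k <- mu * code_count_k + (1 - mu) * number of assignments to k
+    #     codebook_k    = code_sum_k / code_count_k
+    # so codes track running cluster means under @torch.no_grad() rather than being
+    # trained by backpropagation.
+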
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+
+ # Preprocess
+ x = self.preprocess(x)
+
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+        else:
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
diff --git a/models/resnet.py b/models/resnet.py
new file mode 100755
index 0000000..062346e
--- /dev/null
+++ b/models/resnet.py
@@ -0,0 +1,82 @@
+import torch.nn as nn
+import torch
+
+class nonlinearity(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ # swish
+ return x * torch.sigmoid(x)
+
+class ResConv1DBlock(nn.Module):
+ def __init__(self, n_in, n_state, dilation=1, activation='silu', norm=None, dropout=None):
+ super().__init__()
+ padding = dilation
+ self.norm = norm
+ if norm == "LN":
+ self.norm1 = nn.LayerNorm(n_in)
+ self.norm2 = nn.LayerNorm(n_in)
+ elif norm == "GN":
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+ elif norm == "BN":
+ self.norm1 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+ self.norm2 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+
+ else:
+ self.norm1 = nn.Identity()
+ self.norm2 = nn.Identity()
+
+ if activation == "relu":
+ self.activation1 = nn.ReLU()
+ self.activation2 = nn.ReLU()
+
+ elif activation == "silu":
+ self.activation1 = nonlinearity()
+ self.activation2 = nonlinearity()
+
+ elif activation == "gelu":
+ self.activation1 = nn.GELU()
+ self.activation2 = nn.GELU()
+
+
+
+ self.conv1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
+ self.conv2 = nn.Conv1d(n_state, n_in, 1, 1, 0,)
+
+
+ def forward(self, x):
+ x_orig = x
+ if self.norm == "LN":
+ x = self.norm1(x.transpose(-2, -1))
+ x = self.activation1(x.transpose(-2, -1))
+ else:
+ x = self.norm1(x)
+ x = self.activation1(x)
+
+ x = self.conv1(x)
+
+ if self.norm == "LN":
+ x = self.norm2(x.transpose(-2, -1))
+ x = self.activation2(x.transpose(-2, -1))
+ else:
+ x = self.norm2(x)
+ x = self.activation2(x)
+
+ x = self.conv2(x)
+ x = x + x_orig
+ return x
+
+class Resnet1D(nn.Module):
+ def __init__(self, n_in, n_depth, dilation_growth_rate=1, reverse_dilation=True, activation='relu', norm=None):
+ super().__init__()
+
+ blocks = [ResConv1DBlock(n_in, n_in, dilation=dilation_growth_rate ** depth, activation=activation, norm=norm) for depth in range(n_depth)]
+ if reverse_dilation:
+ blocks = blocks[::-1]
+
+ self.model = nn.Sequential(*blocks)
+
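+    # Editor's note: with the depth=3 and dilation_growth_rate=3 defaults passed in from
+    # options/option.py, the blocks above receive dilations (1, 3, 9), reversed to
+    # (9, 3, 1) when reverse_dilation=True. Each ResConv1DBlock uses kernel size 3 with
+    # stride 1 and padding == dilation, so the temporal length is preserved while the
+    # receptive field grows geometrically with depth.
+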
+ def forward(self, x):
+ return self.model(x)
\ No newline at end of file
diff --git a/models/rotation2xyz.py b/models/rotation2xyz.py
new file mode 100755
index 0000000..44f6cb6
--- /dev/null
+++ b/models/rotation2xyz.py
@@ -0,0 +1,92 @@
+# This code is based on https://github.com/Mathux/ACTOR.git
+import torch
+import utils.rotation_conversions as geometry
+
+
+from models.smpl import SMPL, JOINTSTYPE_ROOT
+# from .get_model import JOINTSTYPES
+JOINTSTYPES = ["a2m", "a2mpl", "smpl", "vibe", "vertices"]
+
+
+class Rotation2xyz:
+ def __init__(self, device, dataset='amass'):
+ self.device = device
+ self.dataset = dataset
+ self.smpl_model = SMPL().eval().to(device)
+
+ def __call__(self, x, mask, pose_rep, translation, glob,
+ jointstype, vertstrans, betas=None, beta=0,
+ glob_rot=None, get_rotations_back=False, **kwargs):
+ if pose_rep == "xyz":
+ return x
+
+ if mask is None:
+ mask = torch.ones((x.shape[0], x.shape[-1]), dtype=bool, device=x.device)
+
+ if not glob and glob_rot is None:
+ raise TypeError("You must specify global rotation if glob is False")
+
+ if jointstype not in JOINTSTYPES:
+ raise NotImplementedError("This jointstype is not implemented.")
+
+ if translation:
+ x_translations = x[:, -1, :3]
+ x_rotations = x[:, :-1]
+ else:
+ x_rotations = x
+
+ x_rotations = x_rotations.permute(0, 3, 1, 2)
+ nsamples, time, njoints, feats = x_rotations.shape
+
+ # Compute rotations (convert only masked sequences output)
+ if pose_rep == "rotvec":
+ rotations = geometry.axis_angle_to_matrix(x_rotations[mask])
+ elif pose_rep == "rotmat":
+ rotations = x_rotations[mask].view(-1, njoints, 3, 3)
+ elif pose_rep == "rotquat":
+ rotations = geometry.quaternion_to_matrix(x_rotations[mask])
+ elif pose_rep == "rot6d":
+ rotations = geometry.rotation_6d_to_matrix(x_rotations[mask])
+ else:
+ raise NotImplementedError("No geometry for this one.")
+
+ if not glob:
+ global_orient = torch.tensor(glob_rot, device=x.device)
+ global_orient = geometry.axis_angle_to_matrix(global_orient).view(1, 1, 3, 3)
+ global_orient = global_orient.repeat(len(rotations), 1, 1, 1)
+ else:
+ global_orient = rotations[:, 0]
+ rotations = rotations[:, 1:]
+
+ if betas is None:
+ betas = torch.zeros([rotations.shape[0], self.smpl_model.num_betas],
+ dtype=rotations.dtype, device=rotations.device)
+ betas[:, 1] = beta
+ # import ipdb; ipdb.set_trace()
+ out = self.smpl_model(body_pose=rotations, global_orient=global_orient, betas=betas)
+
+ # get the desirable joints
+ joints = out[jointstype]
+
+ x_xyz = torch.empty(nsamples, time, joints.shape[1], 3, device=x.device, dtype=x.dtype)
+ x_xyz[~mask] = 0
+ x_xyz[mask] = joints
+
+ x_xyz = x_xyz.permute(0, 2, 3, 1).contiguous()
+
+        # put the root joint of the prediction at the origin (per frame)
+ if jointstype != "vertices":
+ rootindex = JOINTSTYPE_ROOT[jointstype]
+ x_xyz = x_xyz - x_xyz[:, [rootindex], :, :]
+
+ if translation and vertstrans:
+            # make the first frame's translation the origin
+ x_translations = x_translations - x_translations[:, :, [0]]
+
+ # add the translation to all the joints
+ x_xyz = x_xyz + x_translations[:, None, :, :]
+
+ if get_rotations_back:
+ return x_xyz, rotations, global_orient
+ else:
+ return x_xyz
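+
+
+# Editor's sketch (illustrative only; requires the SMPL model files configured in
+# utils/config.py):
+#
+#     rot2xyz = Rotation2xyz(device="cuda")
+#     # x: (batch, 25, 6, nframes) for pose_rep="rot6d" -- 24 rotations (global orient +
+#     # 23 body joints) plus one row whose first three channels hold the root translation
+#     joints = rot2xyz(x, mask=None, pose_rep="rot6d", translation=True, glob=True,
+#                      jointstype="smpl", vertstrans=True)
+#     # joints: (batch, 24, 3, nframes)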
diff --git a/models/smpl.py b/models/smpl.py
new file mode 100755
index 0000000..587f541
--- /dev/null
+++ b/models/smpl.py
@@ -0,0 +1,97 @@
+# This code is based on https://github.com/Mathux/ACTOR.git
+import numpy as np
+import torch
+
+import contextlib
+
+from smplx import SMPLLayer as _SMPLLayer
+from smplx.lbs import vertices2joints
+
+
+# action2motion_joints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 24, 38]
+# change 0 and 8
+action2motion_joints = [8, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 21, 24, 38]
+
+from utils.config import SMPL_MODEL_PATH, JOINT_REGRESSOR_TRAIN_EXTRA
+
+JOINTSTYPE_ROOT = {"a2m": 0, # action2motion
+ "smpl": 0,
+ "a2mpl": 0, # set(smpl, a2m)
+                   "vibe": 8}  # root joint is index 8: 'OP MidHip' in JOINT_MAP below
+
+JOINT_MAP = {
+ 'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17,
+ 'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16,
+ 'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0,
+ 'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8,
+ 'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7,
+ 'OP REye': 25, 'OP LEye': 26, 'OP REar': 27,
+ 'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30,
+ 'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34,
+ 'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45,
+ 'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7,
+ 'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17,
+ 'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20,
+ 'Neck (LSP)': 47, 'Top of Head (LSP)': 48,
+ 'Pelvis (MPII)': 49, 'Thorax (MPII)': 50,
+ 'Spine (H36M)': 51, 'Jaw (H36M)': 52,
+ 'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26,
+ 'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27
+}
+
+JOINT_NAMES = [
+ 'OP Nose', 'OP Neck', 'OP RShoulder',
+ 'OP RElbow', 'OP RWrist', 'OP LShoulder',
+ 'OP LElbow', 'OP LWrist', 'OP MidHip',
+ 'OP RHip', 'OP RKnee', 'OP RAnkle',
+ 'OP LHip', 'OP LKnee', 'OP LAnkle',
+ 'OP REye', 'OP LEye', 'OP REar',
+ 'OP LEar', 'OP LBigToe', 'OP LSmallToe',
+ 'OP LHeel', 'OP RBigToe', 'OP RSmallToe', 'OP RHeel',
+ 'Right Ankle', 'Right Knee', 'Right Hip',
+ 'Left Hip', 'Left Knee', 'Left Ankle',
+ 'Right Wrist', 'Right Elbow', 'Right Shoulder',
+ 'Left Shoulder', 'Left Elbow', 'Left Wrist',
+ 'Neck (LSP)', 'Top of Head (LSP)',
+ 'Pelvis (MPII)', 'Thorax (MPII)',
+ 'Spine (H36M)', 'Jaw (H36M)',
+ 'Head (H36M)', 'Nose', 'Left Eye',
+ 'Right Eye', 'Left Ear', 'Right Ear'
+]
+
+
+# adapted from VIBE/SPIN to output smpl_joints, vibe joints and action2motion joints
+class SMPL(_SMPLLayer):
+ """ Extension of the official SMPL implementation to support more joints """
+
+ def __init__(self, model_path=SMPL_MODEL_PATH, **kwargs):
+ kwargs["model_path"] = model_path
+
+        # silence the warning printed about the 10 shape (beta) parameters
+ with contextlib.redirect_stdout(None):
+ super(SMPL, self).__init__(**kwargs)
+
+ J_regressor_extra = np.load(JOINT_REGRESSOR_TRAIN_EXTRA)
+ self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32))
+ vibe_indexes = np.array([JOINT_MAP[i] for i in JOINT_NAMES])
+ a2m_indexes = vibe_indexes[action2motion_joints]
+ smpl_indexes = np.arange(24)
+ a2mpl_indexes = np.unique(np.r_[smpl_indexes, a2m_indexes])
+
+ self.maps = {"vibe": vibe_indexes,
+ "a2m": a2m_indexes,
+ "smpl": smpl_indexes,
+ "a2mpl": a2mpl_indexes}
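+
+        # Editor's note: `self.maps` stores index sets into the concatenated
+        # [SMPL joints, extra regressed joints] tensor built in forward():
+        #   - "smpl":  the 24 native SMPL joints,
+        #   - "vibe":  the 49 VIBE/SPIN joints (25 OpenPose + 24 extra, via JOINT_MAP),
+        #   - "a2m":   the 18 Action2Motion joints (a reordered subset of "vibe"),
+        #   - "a2mpl": the union of the "smpl" and "a2m" index sets.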
+
+ def forward(self, *args, **kwargs):
+ smpl_output = super(SMPL, self).forward(*args, **kwargs)
+
+ extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices)
+ all_joints = torch.cat([smpl_output.joints, extra_joints], dim=1)
+
+ output = {"vertices": smpl_output.vertices}
+
+        for jointstype, indexes in self.maps.items():
+            output[jointstype] = all_joints[:, indexes]
+
+ return output
\ No newline at end of file
diff --git a/models/vqvae.py b/models/vqvae.py
new file mode 100755
index 0000000..a01a747
--- /dev/null
+++ b/models/vqvae.py
@@ -0,0 +1,134 @@
+# This code is based on https://github.com/Mael-zys/T2M-GPT.git
+import torch.nn as nn
+from models.encdec import Encoder, Decoder
+from models.quantize_cnn import QuantizeEMAReset, Quantizer, QuantizeEMA, QuantizeReset
+
+
+class VQVAE_251(nn.Module):
+ def __init__(self,
+ args,
+ nb_code=1024,
+ code_dim=512,
+ output_emb_width=512,
+ down_t=3,
+ stride_t=2,
+ width=512,
+ depth=3,
+ dilation_growth_rate=3,
+ activation='relu',
+ norm=None):
+
+ super().__init__()
+ self.code_dim = code_dim
+ self.num_code = nb_code
+ self.quant = args.quantizer
+ self.encoder = Encoder(251 if args.dataname == 'kit' else 263, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+ self.decoder = Decoder(251 if args.dataname == 'kit' else 263, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+ if args.quantizer == "ema_reset":
+ self.quantizer = QuantizeEMAReset(nb_code, code_dim, args)
+ elif args.quantizer == "orig":
+ self.quantizer = Quantizer(nb_code, code_dim, 1.0)
+ elif args.quantizer == "ema":
+ self.quantizer = QuantizeEMA(nb_code, code_dim, args)
+ elif args.quantizer == "reset":
+ self.quantizer = QuantizeReset(nb_code, code_dim, args)
+
+
+ def preprocess(self, x):
+ # (bs, T, Jx3) -> (bs, Jx3, T)
+ x = x.permute(0,2,1).float()
+ return x
+
+
+ def postprocess(self, x):
+ # (bs, Jx3, T) -> (bs, T, Jx3)
+ x = x.permute(0,2,1)
+ return x
+
+
+ def encode(self, x):
+ N, T, _ = x.shape
+ x_in = self.preprocess(x)
+ x_encoder = self.encoder(x_in)
+ # import pdb; pdb.set_trace()
+ x_encoder = self.postprocess(x_encoder)
+ x_encoder = x_encoder.contiguous().view(-1, x_encoder.shape[-1]) # (NT, C)
+ code_idx = self.quantizer.quantize(x_encoder)
+ code_idx = code_idx.view(N, -1)
+ return code_idx
+
+
+ def encode_x(self, x):
+ N, T, _ = x.shape
+ x_in = self.preprocess(x)
+ x_encoder = self.encoder(x_in)
+ # import pdb; pdb.set_trace()
+ x_encoder = self.postprocess(x_encoder)
+ x_encoder = x_encoder.contiguous().view(-1, x_encoder.shape[-1]) # (NT, C)
+ return x_encoder # (B*T, 512)
+
+ def forward(self, x):
+
+ x_in = self.preprocess(x)
+ # Encode
+ x_encoder = self.encoder(x_in)
+
+ ## quantization
+ x_quantized, loss, perplexity = self.quantizer(x_encoder)
+
+ ## decoder
+ x_decoder = self.decoder(x_quantized)
+ x_out = self.postprocess(x_decoder)
+ return x_out, loss, perplexity
+
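+    # Editor's note on forward() above (shapes are indicative, assuming the default
+    # down_t=2 and stride_t=2 from options/option.py): a HumanML3D batch of shape
+    # (bs, T, 263) is encoded to (bs, 512, T/4), quantised against the codebook, decoded
+    # back to (bs, T, 263), and returned together with the quantisation loss and the
+    # codebook perplexity.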
+
+ def forward_decoder(self, x):
+ x_d = self.quantizer.dequantize(x)
+ x_d = x_d.view(1, -1, self.code_dim).permute(0, 2, 1).contiguous()
+
+ # decoder
+ x_decoder = self.decoder(x_d)
+ x_out = self.postprocess(x_decoder)
+ return x_out
+
+
+
+class HumanVQVAE(nn.Module):
+ def __init__(self,
+ args,
+ nb_code=512,
+ code_dim=512,
+ output_emb_width=512,
+ down_t=3,
+ stride_t=2,
+ width=512,
+ depth=3,
+ dilation_growth_rate=3,
+ activation='relu',
+ norm=None):
+
+ super().__init__()
+
+ self.nb_joints = 21 if args.dataname == 'kit' else 22
+ self.vqvae = VQVAE_251(args, nb_code, code_dim, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+
+ def encode(self, x):
+ b, t, c = x.size()
+ quants = self.vqvae.encode(x) # (N, T)
+ return quants
+
+ def encode_x(self, x):
+ b, t, c = x.size()
+        quants = self.vqvae.encode_x(x)  # (B*T, C)
+ return quants
+
+ def forward(self, x):
+
+ x_out, loss, perplexity = self.vqvae(x)
+
+ return x_out, loss, perplexity
+
+ def forward_decoder(self, x):
+ x_out = self.vqvae.forward_decoder(x)
+ return x_out
+
\ No newline at end of file
diff --git a/options/get_eval_option.py b/options/get_eval_option.py
new file mode 100755
index 0000000..d0989ba
--- /dev/null
+++ b/options/get_eval_option.py
@@ -0,0 +1,83 @@
+from argparse import Namespace
+import re
+from os.path import join as pjoin
+
+
+def is_float(numStr):
+ flag = False
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
+ try:
+ reg = re.compile(r'^[-+]?[0-9]+\.[0-9]+$')
+ res = reg.match(str(numStr))
+ if res:
+ flag = True
+ except Exception as ex:
+ print("is_float() - error: " + str(ex))
+ return flag
+
+
+def is_number(numStr):
+ flag = False
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
+ if str(numStr).isdigit():
+ flag = True
+ return flag
+
+
+def get_opt(opt_path, device):
+ opt = Namespace()
+ opt_dict = vars(opt)
+
+ skip = ('-------------- End ----------------',
+ '------------ Options -------------',
+ '\n')
+ print('Reading', opt_path)
+ with open(opt_path) as f:
+ for line in f:
+ if line.strip() not in skip:
+ # print(line.strip())
+ key, value = line.strip().split(': ')
+ if value in ('True', 'False'):
+ opt_dict[key] = (value == 'True')
+ # print(key, value)
+ elif is_float(value):
+ opt_dict[key] = float(value)
+ elif is_number(value):
+ opt_dict[key] = int(value)
+ else:
+ opt_dict[key] = str(value)
+
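+    # Editor's note: the file parsed above is expected to follow the "key: value" dump
+    # format of the text-to-motion evaluator options, roughly (hypothetical example):
+    #
+    #     ------------ Options -------------
+    #     dataset_name: t2m
+    #     checkpoints_dir: ./checkpoints
+    #     name: Comp_v6_KLD01
+    #     unit_length: 4
+    #     -------------- End ----------------
+    #
+    # Booleans, floats and ints are converted from their string form; anything else is
+    # kept as a string.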
+ # print(opt)
+ opt_dict['which_epoch'] = 'finest'
+ opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name)
+ opt.model_dir = pjoin(opt.save_root, 'model')
+ opt.meta_dir = pjoin(opt.save_root, 'meta')
+
+ if opt.dataset_name == 't2m':
+ opt.data_root = './dataset/HumanML3D/'
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
+ opt.text_dir = pjoin(opt.data_root, 'texts')
+ opt.joints_num = 22
+ opt.dim_pose = 263
+ opt.max_motion_length = 196
+ opt.max_motion_frame = 196
+ opt.max_motion_token = 55
+ elif opt.dataset_name == 'kit':
+ opt.data_root = './dataset/KIT-ML/'
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
+ opt.text_dir = pjoin(opt.data_root, 'texts')
+ opt.joints_num = 21
+ opt.dim_pose = 251
+ opt.max_motion_length = 196
+ opt.max_motion_frame = 196
+ opt.max_motion_token = 55
+ else:
+ raise KeyError('Dataset not recognized')
+
+ opt.dim_word = 300
+ opt.num_classes = 200 // opt.unit_length
+ opt.is_train = False
+ opt.is_continue = False
+ opt.device = device
+
+ return opt
\ No newline at end of file
diff --git a/options/option.py b/options/option.py
new file mode 100755
index 0000000..c7909bb
--- /dev/null
+++ b/options/option.py
@@ -0,0 +1,84 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+ parser.add_argument('--prompt', type=str, default="Generate a textual description corresponding to the given sequence of human motion tokens.", help='task description')
+    parser.add_argument('--input', type=str, help='generation conditions')
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--pretrained_llama', type=str, default="13B")
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--vqvae_pth', type=str, default='/comp_robot/lushunlin/MotionGPT/checkpoints/pretrained_vqvae/t2m.pth', help='path to the pretrained vqvae pth')
+ parser.add_argument('--resume_pth', type=str, help='path to saved finetuned model')
+    parser.add_argument('--lora_path', type=str, help='path to finetuned model for evaluation')
+ parser.add_argument('--mlp_path', type=str, help='mlp path')
+ parser.add_argument('--data_dir', type=str, default='./data/', help='dataset directory')
+
+
+ ## lora
+ parser.add_argument('--lora_r', type=int, default=64)
+ parser.add_argument('--lora_alpha', type=int, default=16)
+ parser.add_argument('--lora_dropout', type=float, default=0.05)
+
+ ## llama
+ parser.add_argument('--block_size', type=int, default=512)
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--micro_batch_size', type=int, default=4, help='micro batch size')
+ # parser.add_argument('--learning_rate', type=float, default=3e-3, help='learning rate')
+ parser.add_argument('--learning_rate_lora', type=float, default=3e-3, help='learning rate of lora')
+ parser.add_argument('--learning_rate_mlp', type=float, default=3e-3, help='learning rate of mlp')
+ parser.add_argument('--weight_decay', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=100, help='warmup steps')
+ parser.add_argument('--eval_interval', type=int, default=100, help='evaluation frequency')
+ parser.add_argument('--save_interval', type=int, default=100, help='model save frequency')
+    parser.add_argument('--eval_iters', type=int, default=100, help='number of evaluation iterations')
+ parser.add_argument('--log_interval', type=int, default=1, help='log frequency')
+
+ ## vqvae
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing vqvae training.')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--quantbeta', type=float, default=1.0, help='dataset directory')
+
+ ## visualization
+ parser.add_argument("--render", action='store_true', help='render smpl')
+ parser.add_argument("--motion_vq_token_path", type=str, help='vq token path for motion visualization')
+
+
+ ## for motionx zero shot
+ parser.add_argument('--motionx_zero_shot_path', type=str, help='zero shot motion dataset directory')
+
+ parser.add_argument("--projectionnn", action='store_true', help='MLP projection')
+ parser.add_argument("--diverse", action='store_true', help='diverse description')
+ parser.add_argument("--vinilla", action='store_true', help='vinilla motion')
+
+
+ # for video llava
+    parser.add_argument('--image_tower', type=str, default='LanguageBind/LanguageBind_Image', help='multimodal image tower')
+    parser.add_argument('--video_tower', type=str, default='LanguageBind/LanguageBind_Video_merge', help='multimodal video tower')
+    parser.add_argument('--mm_vision_select_layer', type=int, default=-2, help='which vision tower layer to take features from')
+    parser.add_argument('--mm_projector_type', type=str, default='mlp2x_gelu', help='multimodal projector type')
+    parser.add_argument('--mm_hidden_size', type=int, default=1024, help='feature dimension of the vision tower')
+    parser.add_argument('--hidden_size', type=int, default=4096, help='dimension the multimodal projector maps visual features to')
+
+ # for mvbench save
+    parser.add_argument('--model_type', type=str, default=None, help='model type tag used when saving MVBench results')
+
+ return parser.parse_args()
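+
+# Editor's note: a typical (hypothetical) invocation of a script that calls
+# get_args_parser() might look like
+#
+#     python <your_entry_script>.py --dataname t2m \
+#         --vqvae_pth ./checkpoints/pretrained_vqvae/t2m.pth \
+#         --lora_path ./out/lora.pth --mlp_path ./out/mlp.pth --pretrained_llama 13B
+#
+# Unspecified arguments fall back to the defaults declared above.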
diff --git a/options/option_video.py b/options/option_video.py
new file mode 100755
index 0000000..3770388
--- /dev/null
+++ b/options/option_video.py
@@ -0,0 +1,80 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+ parser.add_argument('--prompt', type=str, default="Generate a textual description corresponding to the given sequence of human motion tokens.", help='task description')
+    parser.add_argument('--input', type=str, help='generation conditions')
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--pretrained_llama', type=str, default="13B")
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--vqvae_pth', type=str, default='/comp_robot/lushunlin/MotionGPT/checkpoints/pretrained_vqvae/t2m.pth', help='path to the pretrained vqvae pth')
+ parser.add_argument('--resume_pth', type=str, help='path to saved finetuned model')
+    parser.add_argument('--lora_path', type=str, help='path to finetuned model for evaluation')
+ parser.add_argument('--data_dir', type=str, default='./data/', help='dataset directory')
+
+ ## lora
+ parser.add_argument('--lora_r', type=int, default=64)
+ parser.add_argument('--lora_alpha', type=int, default=16)
+ parser.add_argument('--lora_dropout', type=float, default=0.05)
+
+ ## llama
+ parser.add_argument('--block_size', type=int, default=512)
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--micro_batch_size', type=int, default=4, help='micro batch size')
+ parser.add_argument('--learning_rate', type=float, default=3e-3, help='learning rate')
+ parser.add_argument('--weight_decay', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=100, help='warmup steps')
+ parser.add_argument('--eval_interval', type=int, default=100, help='evaluation frequency')
+ parser.add_argument('--save_interval', type=int, default=100, help='model save frequency')
+    parser.add_argument('--eval_iters', type=int, default=100, help='number of evaluation iterations')
+ parser.add_argument('--log_interval', type=int, default=1, help='log frequency')
+
+ ## vqvae
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing vqvae training.')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--quantbeta', type=float, default=1.0, help='dataset directory')
+
+ ## visualization
+ parser.add_argument("--render", action='store_true', help='render smpl')
+ parser.add_argument("--motion_vq_token_path", type=str, help='vq token path for motion visualization')
+
+
+ ## for motionx zero shot
+ parser.add_argument('--motionx_zero_shot_path', type=str, help='zero shot motion dataset directory')
+
+ parser.add_argument("--projectionnn", action='store_true', help='MLP projection')
+ parser.add_argument("--diverse", action='store_true', help='diverse description')
+ parser.add_argument("--vinilla", action='store_true', help='vinilla motion')
+
+ # subparsers = parser.add_subparsers(help='sub-command help')
+ # model_subparser = subparsers.add_parser('model_config', help='subparser1 help')
+    parser.add_argument('--image_tower', type=str, default='LanguageBind/LanguageBind_Image', help='multimodal image tower')
+    parser.add_argument('--video_tower', type=str, default='LanguageBind/LanguageBind_Video_merge', help='multimodal video tower')
+    parser.add_argument('--mm_vision_select_layer', type=int, default=-2, help='which vision tower layer to take features from')
+    parser.add_argument('--mm_projector_type', type=str, default='mlp2x_gelu', help='multimodal projector type')
+    parser.add_argument('--mm_hidden_size', type=int, default=1024, help='feature dimension of the vision tower')
+    parser.add_argument('--hidden_size', type=int, default=4096, help='dimension the multimodal projector maps visual features to')
+
+    return parser.parse_args()
diff --git a/options/option_video_model.py b/options/option_video_model.py
new file mode 100755
index 0000000..a07892c
--- /dev/null
+++ b/options/option_video_model.py
@@ -0,0 +1,11 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('--mm_image_tower', action='store_true', default=True, help='whether to use the multimodal image tower')
+    parser.add_argument('--mm_video_tower', action='store_true', default=True, help='whether to use the multimodal video tower')
+
+ return parser.parse_args()
diff --git a/options/option_vqvae.py b/options/option_vqvae.py
new file mode 100755
index 0000000..244dcdf
--- /dev/null
+++ b/options/option_vqvae.py
@@ -0,0 +1,47 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--resume_pth', type=str, help='path to saved vqvae model')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--learning_rate', type=float, default=2e-4, help='learning rate')
+ parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=1000, help='number of total iterations for warmup')
+ parser.add_argument('--total_iter', default=300000, type=int, help='number of total iterations to run')
+ parser.add_argument('--lr', default=2e-4, type=float, help='max learning rate')
+ parser.add_argument('--lr_scheduler', default=[200000], nargs="+", type=int, help="learning rate schedule (iterations)")
+ parser.add_argument('--gamma', default=0.05, type=float, help="learning rate decay")
+ parser.add_argument("--commit", type=float, default=0.02, help="hyper-parameter for the commitment loss")
+ parser.add_argument('--loss_vel', type=float, default=0.5, help='hyper-parameter for the velocity loss')
+ parser.add_argument('--recons_loss', type=str, default='l1_smooth', help='reconstruction loss')
+ parser.add_argument('--print_iter', default=200, type=int, help='print frequency')
+ parser.add_argument('--eval_iter', default=1000, type=int, help='evaluation frequency')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing training.')
+
+ ## model
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+    parser.add_argument('--vq_norm', type=str, default=None, help='normalization layer used in the VQ-VAE (LN, GN, or BN)')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--beta', type=float, default=1.0, help='commitment loss in standard VQ')
+
+ return parser.parse_args()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..424e68f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,48 @@
+bitsandbytes==0.41.3.post2
+dataloader==2.0
+decord==0.6.0
+deepspeed==0.9.5
+editdistance==0.8.1
+einops==0.8.0
+fastapi==0.111.0
+fire==0.5.0
+flash_attn==2.4.2
+gradio==4.31.5
+huggingface_hub==0.22.2
+imageio==2.13.5
+jsonargparse==4.26.1
+lightning==2.2.0rc0
+lightning_utilities==0.9.0
+matplotlib==3.5.1
+nlg_metricverse==0.9.9
+numpy==1.23.0
+openai==0.28.0
+opencv_python==4.5.5.64
+packaging==21.3
+pandas==1.3.4
+peft==0.8.2
+Pillow==9.0.0
+pycocoevalcap==1.2
+pyrender==0.1.45
+pytorchvideo==0.1.5
+quantize==0.0.4
+ray==2.23.0
+Requests==2.32.2
+scipy==1.13.1
+sentence_transformers==2.2.2
+sentencepiece==0.1.99
+Shapely==2.0.4
+shortuuid==1.0.13
+smplx==0.1.26
+tokenizers==0.13.3
+torch==2.0.0
+torch_xla==2.3.0
+torchvision==0.15.1+cu117
+tqdm==4.66.1
+transformers==4.28.1
+trimesh==3.22.1
+typing_extensions==4.12.0
+uvicorn==0.30.0
+visualize==0.5.1
+xformers==0.0.22
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/video_dataset/prepare_video_dataset_intern_video.py b/scripts/video_dataset/prepare_video_dataset_intern_video.py
new file mode 100755
index 0000000..c60ae3f
--- /dev/null
+++ b/scripts/video_dataset/prepare_video_dataset_intern_video.py
@@ -0,0 +1,155 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import os
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+sys.path.append(os.getcwd())
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+import numpy as np
+
+from options import option
+
+IGNORE_INDEX = -1
+
+def prepare(
+ destination_path: Path = Path("./data"),
+ tokenizer_path: Path = Path("./checkpoints/lit-llama/tokenizer.model"),
+ max_seq_length: int = 2560,
+ seed: int = 42,
+ mask_inputs: bool = False, # as in alpaca-lora
+ split: str = "train"
+):
+ """Prepare the Alpaca dataset for instruction tuning.
+ The output is a training and validation dataset saved as `train.pt` and `val.pt`,
+ which stores the preprocessed and tokenized prompts and labels.
+ """
+
+ destination_path.mkdir(parents=True, exist_ok=True)
+
+ file_path = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/{split}.json'
+
+ # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+ tokenizer = Tokenizer(tokenizer_path)
+
+ with open(file_path, "r") as file:
+ data = json.load(file)
+ data_set = list(data)
+
+ print(f"{split} set has {len(data_set):,} samples")
+
+ print(f"Processing {split} split ...")
+ data_set_new = []
+ for sample in tqdm(data_set):
+ # try:
+ data_set_new.append(prepare_sample(sample, tokenizer, max_seq_length, mask_inputs))
+ # import pdb; pdb.set_trace()
+
+ data_set = data_set_new
+
+ save_pt = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/{split}.pt'
+ torch.save(data_set, save_pt)
+
+
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
+ """Processes a single sample.
+ Each sample in the dataset consists of:
+ - instruction: A string describing the task
+ - input: A string holding a special input value for the instruction.
+ This only applies to some samples, and in others this is empty.
+ - output: The response string
+
+ This function processes this data to produce a prompt text and a label for
+ supervised training. The prompt text is formed as a single message including both
+ the instruction and the input. The label/target is the same message but with the
+ response attached.
+
+ Finally, both the prompt and the label get tokenized. If desired, all tokens
+ in the label that correspond to the original input prompt get masked out (default).
+    """
+ # import pdb; pdb.set_trace()
+ # full_prompt = generate_prompt(example)
+ # import pdb; pdb.set_trace()
+ full_prompt = generate_prompt_mlp(example)
+ full_prompt_and_response = full_prompt + example['output']
+ # import pdb; pdb.set_trace()
+ encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
+ encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)
+
+ # extendedQA = example['QA'][1:]
+ # for qa_item in extendedQA:
+ # q, a = qa_item["Q"], qa_item["A"]
+ # new_concat = "USER: " + q + "ASSISTANT: " + a
+ # full_prompt_and_response = full_prompt_and_response + new_concat
+ # encoded_new_concat = tokenize(tokenizer, new_concat, eos=True, max_length=max_length)
+ # encoded_full_prompt_and_response = torch.cat((encoded_full_prompt_and_response, encoded_new_concat))
+
+
+ # The labels are the full prompt with response, but with the prompt masked out
+ labels = encoded_full_prompt_and_response.clone()
+ if mask_inputs:
+ labels[:len(encoded_full_prompt)] = IGNORE_INDEX
+
+ # import pdb; pdb.set_trace()
+
+ return {**example, "sys_command": generate_system_command(), "input_ids": encoded_full_prompt_and_response, "input_ids_no_response": encoded_full_prompt, "labels": labels}
+
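+# Editor's note (illustrative): a raw sample is expected to carry at least the fields
+# used by prepare_sample above, e.g. (hypothetical values)
+#
+#     {"instruction": "Describe the action shown in the video.",
+#      "input": "<video identifier or path>",
+#      "output": "A person waves with the right hand."}
+#
+# The returned dict keeps those fields and adds "sys_command", the tokenised
+# "input_ids" / "input_ids_no_response" tensors, and the (optionally masked) "labels".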
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+ return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+def detokenizer(tokenizer: Tokenizer, tensor: torch.Tensor):
+ '''
+ tokenizer.decode(torch.tensor([13866, 338]))
+ '''
+ return tokenizer.decode(tensor)
+
+
+def generate_prompt_mlp(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+def generate_system_command():
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "
+ )
+
+
+def main():
+ args = option.get_args_parser()
+ # prepare(split='train')
+ # prepare(split='val')
+ prepare(split='train_intern_human_2M_stage1_caption')
+ prepare(split='val_intern_human_2M_stage1_caption')
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/video_dataset/prepare_video_dataset_video_llava.py b/scripts/video_dataset/prepare_video_dataset_video_llava.py
new file mode 100755
index 0000000..05248e5
--- /dev/null
+++ b/scripts/video_dataset/prepare_video_dataset_video_llava.py
@@ -0,0 +1,178 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import os
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+sys.path.append(os.getcwd())
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+import numpy as np
+
+from options import option
+
+IGNORE_INDEX = -1
+
+def prepare(
+ destination_path: Path = Path("./data"),
+ tokenizer_path: Path = Path("./checkpoints/lit-llama/tokenizer.model"),
+ max_seq_length: int = 2560,
+ seed: int = 42,
+ mask_inputs: bool = False, # as in alpaca-lora
+ split: str = "train"
+):
+ """Prepare the Alpaca dataset for instruction tuning.
+ The output is a training and validation dataset saved as `train.pt` and `val.pt`,
+ which stores the preprocessed and tokenized prompts and labels.
+ """
+
+ destination_path.mkdir(parents=True, exist_ok=True)
+
+ file_path = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/video_llava_{split}.json'
+
+ # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+ tokenizer = Tokenizer(tokenizer_path)
+
+ with open(file_path, "r") as file:
+ data = json.load(file)
+ data_set = list(data)
+
+ print(f"{split} set has {len(data_set):,} samples")
+
+ print(f"Processing {split} split ...")
+ data_set_new = []
+ for sample in tqdm(data_set):
+ # try:
+ data_set_new.append(prepare_sample(sample, tokenizer, max_seq_length, mask_inputs))
+ # import pdb; pdb.set_trace()
+
+ data_set = data_set_new
+
+ save_pt = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/video_llava_{split}.pt'
+ torch.save(data_set, save_pt)
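+
+    # Editor's note: the saved .pt file is just a Python list of per-sample dicts (the
+    # output of prepare_sample below); it can later be reloaded with a plain
+    # `torch.load(save_pt)` and wrapped in a Dataset / DataLoader without further parsing.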
+
+
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
+ """Processes a single sample.
+ Each sample in the dataset consists of:
+ - instruction: A string describing the task
+ - input: A string holding a special input value for the instruction.
+ This only applies to some samples, and in others this is empty.
+ - output: The response string
+
+ This function processes this data to produce a prompt text and a label for
+ supervised training. The prompt text is formed as a single message including both
+ the instruction and the input. The label/target is the same message but with the
+ response attached.
+
+ Finally, both the prompt and the label get tokenized. If desired, all tokens
+ in the label that correspond to the original input prompt get masked out (default).
+    """
+ # import pdb; pdb.set_trace()
+ # full_prompt = generate_prompt(example)
+ # import pdb; pdb.set_trace()
+ full_prompt = generate_prompt_mlp(example)
+ full_prompt_and_response = full_prompt + example['output']
+
+ encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
+ encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)
+
+ # extendedQA = example['QA'][1:]
+ # for qa_item in extendedQA:
+ # q, a = qa_item["Q"], qa_item["A"]
+ # new_concat = "USER: " + q + "ASSISTANT: " + a
+ # full_prompt_and_response = full_prompt_and_response + new_concat
+ # encoded_new_concat = tokenize(tokenizer, new_concat, eos=True, max_length=max_length)
+ # encoded_full_prompt_and_response = torch.cat((encoded_full_prompt_and_response, encoded_new_concat))
+
+
+ # The labels are the full prompt with response, but with the prompt masked out
+ labels = encoded_full_prompt_and_response.clone()
+ if mask_inputs:
+ labels[:len(encoded_full_prompt)] = IGNORE_INDEX
+
+ # import pdb; pdb.set_trace()
+
+ return {**example, "sys_command": generate_system_command(), "input_ids": encoded_full_prompt_and_response, "input_ids_no_response": encoded_full_prompt, "labels": labels}
+
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+ return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+def detokenizer(tokenizer: Tokenizer, tensor: torch.Tensor):
+ '''
+ tokenizer.decode(torch.tensor([13866, 338]))
+ '''
+ return tokenizer.decode(tensor)
+
+
+def generate_prompt_mlp(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+def generate_prompt_mlp_mv_bench(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+
+def generate_system_command():
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "
+ )
+
+
+def main():
+ args = option.get_args_parser()
+ # prepare(split='train')
+ # prepare(split='val')
+ prepare(split='train_filter_wrong_decord_videos')
+ prepare(split='val_filter_wrong_decord_videos')
+
+
+if __name__ == "__main__":
+ main()