diff --git a/README.md b/README.md index 26c7e32..a6bdc61 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,10 @@ base_vit = ViT( emb_dropout = 0.1 ) -vit = Extractor(base_vit, return_embeddings_only = True) +vit = Extractor( + base_vit, + return_embeddings_only = True +) clip = CLIP( image_encoder = vit, @@ -105,6 +108,56 @@ loss = clip(text, images, text_mask = mask, return_loss = True) loss.backward() ``` +Finally, one can also have the text transformer be externally defined. It will need to return the embeddings including the CLS token, for now. + +```python +import torch +from x_clip import CLIP, TextTransformer + +from vit_pytorch import ViT +from vit_pytorch.extractor import Extractor + +base_vit = ViT( + image_size = 256, + patch_size = 32, + num_classes = 1000, + dim = 512, + depth = 6, + heads = 16, + mlp_dim = 2048, + dropout = 0.1, + emb_dropout = 0.1 +) + +image_encoder = Extractor( + base_vit, + return_embeddings_only = True +) + +text_encoder = TextTransformer( + dim = 512, + num_tokens = 10000, + max_seq_len = 256 + 1, + depth = 6, + heads = 8 +) + +clip = CLIP( + image_encoder = image_encoder, + text_encoder = text_encoder, + dim_image = 512, + dim_text = 512, + dim_latent = 512 +) + +text = torch.randint(0, 10000, (4, 256)) +images = torch.randn(4, 3, 256, 256) +mask = torch.ones_like(text).bool() + +loss = clip(text, images, text_mask = mask, return_loss = True) +loss.backward() +``` + ## Citations ```bibtex diff --git a/setup.py b/setup.py index 524d6be..b2b95d6 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'x-clip', packages = find_packages(exclude=[]), - version = '0.0.10', + version = '0.0.11', license='MIT', description = 'X-CLIP', author = 'Phil Wang', diff --git a/x_clip/__init__.py b/x_clip/__init__.py index 30f5d44..ff88501 100644 --- a/x_clip/__init__.py +++ b/x_clip/__init__.py @@ -1 +1 @@ -from x_clip.x_clip import CLIP +from x_clip.x_clip import CLIP, TextTransformer diff --git a/x_clip/x_clip.py b/x_clip/x_clip.py index 4fd3592..5fe39e1 100644 --- a/x_clip/x_clip.py +++ b/x_clip/x_clip.py @@ -212,6 +212,7 @@ def __init__( self, *, image_encoder = None, + text_encoder = None, dim_text = 512, dim_image = 512, dim_latent = 512, @@ -240,13 +241,16 @@ def __init__( # instantiate text transformer - self.text_transformer = TextTransformer( - dim = dim_text, - num_tokens = num_text_tokens + (1 if use_mlm else 0), - max_seq_len = text_seq_len, - depth = text_enc_depth, - heads = text_heads - ) + if exists(text_encoder): + self.text_transformer = text_encoder + else: + self.text_transformer = TextTransformer( + dim = dim_text, + num_tokens = num_text_tokens + (1 if use_mlm else 0), + max_seq_len = text_seq_len + 1, + depth = text_enc_depth, + heads = text_heads + ) # instantiate image transformer