Create CompareCaptionsGenerate.ipynb
whats2000 committed Aug 26, 2024
1 parent 5121b48 commit 40a22c4
Showing 1 changed file with 262 additions and 0 deletions.
src/ablation_experiment/CompareCaptionsGenerate.ipynb: 262 additions, 0 deletions
@@ -0,0 +1,262 @@
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-26T07:39:52.816157Z",
"start_time": "2024-08-26T07:39:52.795021Z"
}
},
"cell_type": "code",
"source": [
"import torch\n",
"from transformers import CLIPTextModelWithProjection, CLIPVisionModelWithProjection, CLIPImageProcessor\n",
"\n",
"from src.utils import device"
],
"id": "af51e4f14f8d4d4b",
"outputs": [],
"execution_count": 6
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# <div style=\"font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;\">Step 1: Set up the experiment</div>",
"id": "6b34e3ba439aaa4f"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## <div style=\"font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;\">Set up the cache for the experiment</div>",
"id": "7c2294918930c572"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-08-26T07:02:56.696609Z",
"start_time": "2024-08-26T07:02:54.348924Z"
}
},
"source": "cache = {}",
"outputs": [],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## <div style=\"font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;\">Same concept as script version here</div>",
"id": "9c64305289b9fe48"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-26T07:02:56.777576Z",
"start_time": "2024-08-26T07:02:56.775383Z"
}
},
"cell_type": "code",
"source": "CLIP_NAME = 'laion/CLIP-ViT-L-14-laion2B-s32B-b82K'",
"id": "22e5d9c8c2fc4547",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-26T07:03:03.069862Z",
"start_time": "2024-08-26T07:02:56.857715Z"
}
},
"cell_type": "code",
"source": [
"clip_text_encoder = CLIPTextModelWithProjection.from_pretrained(CLIP_NAME, torch_dtype=torch.float32, projection_dim=768)\n",
"clip_text_encoder = clip_text_encoder.float().to(device)\n",
"\n",
"print(\"clip text encoder loaded.\")\n",
"clip_text_encoder.eval()"
],
"id": "94a46a8e90581af4",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clip text encoder loaded.\n"
]
},
{
"data": {
"text/plain": [
"CLIPTextModelWithProjection(\n",
" (text_model): CLIPTextTransformer(\n",
" (embeddings): CLIPTextEmbeddings(\n",
" (token_embedding): Embedding(49408, 768)\n",
" (position_embedding): Embedding(77, 768)\n",
" )\n",
" (encoder): CLIPEncoder(\n",
" (layers): ModuleList(\n",
" (0-11): 12 x CLIPEncoderLayer(\n",
" (self_attn): CLIPAttention(\n",
" (k_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (q_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): CLIPMLP(\n",
" (activation_fn): GELUActivation()\n",
" (fc1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (fc2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (text_projection): Linear(in_features=768, out_features=768, bias=False)\n",
")"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 3
},
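{
"metadata": {},
"cell_type": "markdown",
"source": "A minimal usage sketch (assumed, for illustration): tokenize a caption and project it with the text encoder loaded above into the 768-dimensional embedding space. The tokenizer and the example caption are placeholders.",
"id": "b3d1f5a79c2e4d80"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: encode a caption with the text encoder loaded above.\n",
"# The tokenizer and the example caption are placeholders.\n",
"from transformers import CLIPTokenizer\n",
"\n",
"tokenizer = CLIPTokenizer.from_pretrained(CLIP_NAME)\n",
"\n",
"with torch.no_grad():\n",
"    caption_tokens = tokenizer(['a photo of a red dress'], padding=True, return_tensors='pt').to(device)\n",
"    text_embeds = clip_text_encoder(**caption_tokens).text_embeds  # projected caption embedding, shape (1, 768)\n",
"\n",
"print(text_embeds.shape)"
],
"id": "c4e2a6b80d3f5e91",
"outputs": [],
"execution_count": null
},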
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-26T07:03:04.748384Z",
"start_time": "2024-08-26T07:03:03.204895Z"
}
},
"cell_type": "code",
"source": [
"clip_img_encoder = CLIPVisionModelWithProjection.from_pretrained(CLIP_NAME,torch_dtype=torch.float32, projection_dim=768)\n",
"\n",
"clip_img_encoder = clip_img_encoder.float().to(device)\n",
"print(\"clip img encoder loaded.\")\n",
"clip_img_encoder.eval()"
],
"id": "32f7fb2e83ce7d74",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clip img encoder loaded.\n"
]
},
{
"data": {
"text/plain": [
"CLIPVisionModelWithProjection(\n",
" (vision_model): CLIPVisionTransformer(\n",
" (embeddings): CLIPVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)\n",
" (position_embedding): Embedding(257, 1024)\n",
" )\n",
" (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (encoder): CLIPEncoder(\n",
" (layers): ModuleList(\n",
" (0-23): 24 x CLIPEncoderLayer(\n",
" (self_attn): CLIPAttention(\n",
" (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): CLIPMLP(\n",
" (activation_fn): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (visual_projection): Linear(in_features=1024, out_features=768, bias=False)\n",
")"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-26T07:40:00.097035Z",
"start_time": "2024-08-26T07:40:00.093435Z"
}
},
"cell_type": "code",
"source": [
"print('CLIP preprocess pipeline is used')\n",
"preprocess = CLIPImageProcessor(\n",
" crop_size={'height': 224, 'width': 224},\n",
" do_center_crop=True,\n",
" do_convert_rgb=True,\n",
" do_normalize=True,\n",
" do_rescale=True,\n",
" do_resize=True,\n",
" image_mean=[0.48145466, 0.4578275, 0.40821073],\n",
" image_std=[0.26862954, 0.26130258, 0.27577711],\n",
" resample=3,\n",
" size={'shortest_edge': 224},\n",
")"
],
"id": "54ac20b43a2e8b69",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CLIP preprocess pipeline is used\n"
]
}
],
"execution_count": 7
},
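{
"metadata": {},
"cell_type": "markdown",
"source": "A minimal usage sketch (assumed, for illustration): run an image through the preprocess pipeline and the image encoder defined above. The blank PIL image stands in for a dataset image; its embedding can then be compared with caption embeddings in the shared 768-dimensional space.",
"id": "d5f3b7c91e4a6072"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: preprocess an image and encode it with the image encoder loaded above.\n",
"# The blank PIL image is a placeholder for a dataset image.\n",
"from PIL import Image\n",
"\n",
"example_image = Image.new('RGB', (256, 256), color='white')\n",
"pixel_values = preprocess(images=example_image, return_tensors='pt')['pixel_values'].to(device)\n",
"\n",
"with torch.no_grad():\n",
"    image_embeds = clip_img_encoder(pixel_values=pixel_values).image_embeds  # projected image embedding, shape (1, 768)\n",
"\n",
"print(image_embeds.shape)"
],
"id": "e6a4c8d02f5b7183",
"outputs": [],
"execution_count": null
},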
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "5be046bb92d5e588"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
