update the eps in config
Yanqing0327 committed Dec 17, 2024
1 parent 59e376d commit b27cf68
Showing 7 changed files with 28 additions and 21 deletions.
README.md (4 changes: 1 addition & 3 deletions)

@@ -94,9 +94,7 @@ with torch.no_grad(), torch.cuda.amp.autocast():
 
 print("Label probs:", text_probs) # prints: [[0., 0., 0., 1.0]]
 ```
-#### Note:
-#### 1. We made modifications to the tokenizer implementation in open_clip/tokenizer.py.
-#### 2. Due to differences in the default epsilon values for LayerNorm initialization between JAX and PyTorch, we adjusted the default epsilon value in open_clip/transformer.py to align the model's behavior.
+#### Note: We made modifications to the tokenizer implementation in open_clip/tokenizer.py.
 
 ## Acknowledgement
 
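The dropped README note points at the underlying issue: torch.nn.LayerNorm defaults to eps=1e-5, while JAX/Flax LayerNorm defaults to 1e-6, so a PyTorch port of JAX-trained weights has to set the epsilon explicitly. A minimal PyTorch-only sketch of why the value matters numerically (the Flax default is only quoted in a comment, not imported):

```python
import torch
import torch.nn as nn

# torch.nn.LayerNorm defaults to eps=1e-5; flax.linen.LayerNorm defaults to
# epsilon=1e-6. Reproducing a JAX-trained model's numerics in PyTorch therefore
# requires setting the epsilon explicitly.
ln_pt_default = nn.LayerNorm(1024)             # eps=1e-5 (PyTorch default)
ln_jax_match = nn.LayerNorm(1024, eps=1e-6)    # matches the Flax default

x = torch.randn(2, 1024) * 1e-3  # small activations make the eps term visible
print((ln_pt_default(x) - ln_jax_match(x)).abs().max())  # non-negligible gap
```

With this commit, that explicit setting moves from hard-coded defaults in the code into the per-model configs changed below.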
open_clip/factory.py (3 changes: 0 additions & 3 deletions)

@@ -346,9 +346,6 @@ def create_model(
             else:
                 model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
         else:
-            if 'CLIPS' in model_name:
-                model_cfg['vision_cfg']['eps'] = 1e-6
-                model_cfg['text_cfg']['eps'] = 1e-6
             model = CLIP(**model_cfg, cast_dtype=cast_dtype)
 
         if precision in ("fp16", "bf16"):
open_clip/model.py (4 changes: 0 additions & 4 deletions)

@@ -31,7 +31,6 @@ class CLIPVisionCfg:
     mlp_ratio: float = 4.0
     patch_size: int = 16
     image_size: Union[Tuple[int, int], int] = 224
-    eps: float = 1e-5
 
     ls_init_value: Optional[float] = None  # layer scale initial value
     patch_dropout: float = 0.  # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
@@ -77,7 +76,6 @@ class CLIPTextCfg:
     output_tokens: bool = False
     act_kwargs: dict = None
     norm_kwargs: dict = None
-    eps: float = 1e-5
 
     # HuggingFace specific text tower config
     hf_model_name: Optional[str] = None
@@ -168,7 +166,6 @@ def _build_vision_tower(
         output_dim=embed_dim,
         act_layer=act_layer,
         norm_layer=norm_layer,
-        eps=vision_cfg.eps,
    )
 
     return visual
@@ -218,7 +215,6 @@ def _build_text_tower(
         output_tokens=text_cfg.output_tokens,
         act_layer=act_layer,
         norm_layer=norm_layer,
-        eps=text_cfg.eps,
    )
    return text
 
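With the eps field and constructor argument removed, the epsilon is expected to flow through the existing norm_kwargs entry instead. A minimal sketch of that mechanism, assuming norm_kwargs is bound into the norm layer with functools.partial before the tower is built (as upstream open_clip does for act_kwargs/norm_kwargs); plain nn.LayerNorm stands in for open_clip's LayerNorm subclass and the variable names are illustrative:

```python
from functools import partial

import torch.nn as nn

# Instead of threading an eps argument through _build_vision_tower and the
# VisionTransformer constructor, the config's norm_kwargs are pre-bound into
# the norm layer class, which is then passed to the tower as norm_layer.
vision_cfg = {"width": 1024, "norm_kwargs": {"eps": 1e-6}}

norm_layer = nn.LayerNorm
if vision_cfg.get("norm_kwargs"):
    norm_layer = partial(norm_layer, **vision_cfg["norm_kwargs"])

# Inside the tower, norm_layer(width) already carries eps=1e-6:
ln_pre = norm_layer(vision_cfg["width"])
print(ln_pre.eps)  # 1e-06
```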
open_clip/model_configs/ViT-H-14-CLIPS-224.json (8 changes: 7 additions & 1 deletion)

@@ -9,7 +9,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -25,6 +28,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
open_clip/model_configs/ViT-L-14-CLIPS-224.json (8 changes: 7 additions & 1 deletion)

@@ -8,7 +8,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -24,6 +27,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
open_clip/model_configs/ViT-L-14-CLIPS-336.json (8 changes: 7 additions & 1 deletion)

@@ -8,7 +8,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -24,6 +27,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
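All three configs should now carry the same epsilon. A small check over the JSON files; the relative paths assume the repository root as the working directory, and the model_cfg fallback guards against a wrapped config layout (top-level vision_cfg/text_cfg is the standard open_clip form):

```python
import json
from pathlib import Path

# Paths assume the repository root as the working directory.
config_paths = [
    "open_clip/model_configs/ViT-H-14-CLIPS-224.json",
    "open_clip/model_configs/ViT-L-14-CLIPS-224.json",
    "open_clip/model_configs/ViT-L-14-CLIPS-336.json",
]

for path in config_paths:
    cfg = json.loads(Path(path).read_text())
    model_cfg = cfg.get("model_cfg", cfg)  # tolerate a wrapped config layout
    vision_eps = model_cfg["vision_cfg"]["norm_kwargs"]["eps"]
    text_eps = model_cfg["text_cfg"]["norm_kwargs"]["eps"]
    print(path, vision_eps, text_eps)  # expected: 1e-06 and 1e-06 for each file
```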
open_clip/transformer.py (14 changes: 6 additions & 8 deletions)

@@ -455,7 +455,6 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = LayerNorm,
             output_tokens: bool = False,
-            eps: float = 1e-5  # Add eps as a parameter
     ):
         super().__init__()
         assert pool_type in ('tok', 'avg', 'none')
@@ -488,15 +487,15 @@ def __init__(
         # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
         self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
 
-        self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width, eps=eps)
+        self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width)
         self.transformer = Transformer(
             width,
             layers,
             heads,
             mlp_ratio,
             ls_init_value=ls_init_value,
             act_layer=act_layer,
-            norm_layer=lambda x: norm_layer(x, eps=eps),
+            norm_layer=norm_layer,
         )
 
         if attentional_pool:
@@ -534,7 +533,7 @@ def __init__(
             pool_dim = width
         self.pool_type = pool_type
 
-        self.ln_post = norm_layer(pool_dim, eps=eps)
+        self.ln_post = norm_layer(pool_dim)
         self.proj = nn.Parameter(scale * torch.randn(pool_dim, output_dim))
 
         self.init_parameters()
@@ -694,7 +693,6 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = LayerNorm,
             output_tokens: bool = False,
-            eps: float = 1e-5  # Add eps as a parameter
     ):
         super().__init__()
         assert pool_type in ('first', 'last', 'argmax', 'none')
@@ -721,9 +719,9 @@
             mlp_ratio=mlp_ratio,
             ls_init_value=ls_init_value,
             act_layer=act_layer,
-            norm_layer=lambda x: norm_layer(x, eps=eps),
+            norm_layer=norm_layer,
         )
-        self.ln_final = norm_layer(width, eps=eps)
+        self.ln_final = norm_layer(width)
 
         if no_causal_mask:
             self.attn_mask = None
@@ -925,4 +923,4 @@ def forward(self, image_embs, text_embs):
 
     @torch.jit.ignore
     def set_grad_checkpointing(self, enable=True):
-        self.grad_checkpointing = enable
+        self.grad_checkpointing = enable
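As an end-to-end check that the config-driven epsilon reaches the built model, a short usage sketch; it assumes this fork is importable as open_clip with these configs registered under the same names, and that the attribute paths (visual.ln_post, ln_final) follow open_clip's standard VisionTransformer/CLIP layout:

```python
import open_clip

# Build from the repo's JSON config only; no pretrained weights are needed
# just to inspect the LayerNorm settings. Model name and attribute paths
# assume the CLIPS fork's config registration and open_clip's usual layout.
model = open_clip.create_model("ViT-L-14-CLIPS-224")

print(model.visual.ln_post.eps)  # expected 1e-06, from vision_cfg.norm_kwargs
print(model.ln_final.eps)        # expected 1e-06, from text_cfg.norm_kwargs
```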
