update the eps in config
Yanqing0327 committed Dec 17, 2024
1 parent 59e376d commit b27cf68
Showing 7 changed files with 28 additions and 21 deletions.
README.md (4 changes: 1 addition & 3 deletions)

@@ -94,9 +94,7 @@ with torch.no_grad(), torch.cuda.amp.autocast():
 
 print("Label probs:", text_probs) # prints: [[0., 0., 0., 1.0]]
 ```
-#### Note:
-#### 1. We made modifications to the tokenizer implementation in open_clip/tokenizer.py.
-#### 2. Due to differences in the default epsilon values for LayerNorm initialization between JAX and PyTorch, we adjusted the default epsilon value in open_clip/transformer.py to align the model's behavior.
+#### Note: We made modifications to the tokenizer implementation in open_clip/tokenizer.py.
 
 ## Acknowledgement
 
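The dropped README note points at the underlying issue: torch.nn.LayerNorm defaults to eps=1e-5, while JAX/Flax LayerNorm defaults to 1e-6, so a PyTorch port of JAX-trained weights has to set the epsilon explicitly. A minimal PyTorch-only sketch of why the value matters numerically (the Flax default is only quoted in a comment, not imported):

```python
import torch
import torch.nn as nn

# torch.nn.LayerNorm defaults to eps=1e-5; flax.linen.LayerNorm defaults to
# epsilon=1e-6. Reproducing a JAX-trained model's numerics in PyTorch therefore
# requires setting the epsilon explicitly.
ln_pt_default = nn.LayerNorm(1024)             # eps=1e-5 (PyTorch default)
ln_jax_match = nn.LayerNorm(1024, eps=1e-6)    # matches the Flax default

x = torch.randn(2, 1024) * 1e-3  # small activations make the eps term visible
print((ln_pt_default(x) - ln_jax_match(x)).abs().max())  # non-negligible gap
```

With this commit, that explicit setting moves from hard-coded defaults in the code into the per-model configs changed below.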
open_clip/factory.py (3 changes: 0 additions & 3 deletions)

@@ -346,9 +346,6 @@ def create_model(
             else:
                 model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
         else:
-            if 'CLIPS' in model_name:
-                model_cfg['vision_cfg']['eps'] = 1e-6
-                model_cfg['text_cfg']['eps'] = 1e-6
             model = CLIP(**model_cfg, cast_dtype=cast_dtype)
 
         if precision in ("fp16", "bf16"):
open_clip/model.py (4 changes: 0 additions & 4 deletions)

@@ -31,7 +31,6 @@ class CLIPVisionCfg:
     mlp_ratio: float = 4.0
     patch_size: int = 16
     image_size: Union[Tuple[int, int], int] = 224
-    eps: float = 1e-5
 
     ls_init_value: Optional[float] = None  # layer scale initial value
     patch_dropout: float = 0.  # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
@@ -77,7 +76,6 @@ class CLIPTextCfg:
     output_tokens: bool = False
     act_kwargs: dict = None
     norm_kwargs: dict = None
-    eps: float = 1e-5
 
     # HuggingFace specific text tower config
     hf_model_name: Optional[str] = None
@@ -168,7 +166,6 @@ def _build_vision_tower(
         output_dim=embed_dim,
         act_layer=act_layer,
         norm_layer=norm_layer,
-        eps=vision_cfg.eps,
    )
 
     return visual
@@ -218,7 +215,6 @@ def _build_text_tower(
         output_tokens=text_cfg.output_tokens,
         act_layer=act_layer,
         norm_layer=norm_layer,
-        eps=text_cfg.eps,
    )
    return text
 
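With the eps field and constructor argument removed, the epsilon is expected to flow through the existing norm_kwargs entry instead. A minimal sketch of that mechanism, assuming norm_kwargs is bound into the norm layer with functools.partial before the tower is built (as upstream open_clip does for act_kwargs/norm_kwargs); plain nn.LayerNorm stands in for open_clip's LayerNorm subclass and the variable names are illustrative:

```python
from functools import partial

import torch.nn as nn

# Instead of threading an eps argument through _build_vision_tower and the
# VisionTransformer constructor, the config's norm_kwargs are pre-bound into
# the norm layer class, which is then passed to the tower as norm_layer.
vision_cfg = {"width": 1024, "norm_kwargs": {"eps": 1e-6}}

norm_layer = nn.LayerNorm
if vision_cfg.get("norm_kwargs"):
    norm_layer = partial(norm_layer, **vision_cfg["norm_kwargs"])

# Inside the tower, norm_layer(width) already carries eps=1e-6:
ln_pre = norm_layer(vision_cfg["width"])
print(ln_pre.eps)  # 1e-06
```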
open_clip/model_configs/ViT-H-14-CLIPS-224.json (8 changes: 7 additions & 1 deletion)

@@ -9,7 +9,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -25,6 +28,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
open_clip/model_configs/ViT-L-14-CLIPS-224.json (8 changes: 7 additions & 1 deletion)

@@ -8,7 +8,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -24,6 +27,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
open_clip/model_configs/ViT-L-14-CLIPS-336.json (8 changes: 7 additions & 1 deletion)

@@ -8,7 +8,10 @@
         "patch_size": 14,
         "no_ln_pre": true,
         "pool_type": "avg",
-        "final_ln_after_pool": true
+        "final_ln_after_pool": true,
+        "norm_kwargs": {
+            "eps": 1e-6
+        }
     },
     "text_cfg": {
         "context_length": 80,
@@ -24,6 +27,9 @@
         "no_causal_mask": true,
         "act_kwargs": {
             "approximate": "tanh"
+        },
+        "norm_kwargs": {
+            "eps": 1e-6
         }
     }
 },
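All three configs should now carry the same epsilon. A small check over the JSON files; the relative paths assume the repository root as the working directory, and the model_cfg fallback guards against a wrapped config layout (top-level vision_cfg/text_cfg is the standard open_clip form):

```python
import json
from pathlib import Path

# Paths assume the repository root as the working directory.
config_paths = [
    "open_clip/model_configs/ViT-H-14-CLIPS-224.json",
    "open_clip/model_configs/ViT-L-14-CLIPS-224.json",
    "open_clip/model_configs/ViT-L-14-CLIPS-336.json",
]

for path in config_paths:
    cfg = json.loads(Path(path).read_text())
    model_cfg = cfg.get("model_cfg", cfg)  # tolerate a wrapped config layout
    vision_eps = model_cfg["vision_cfg"]["norm_kwargs"]["eps"]
    text_eps = model_cfg["text_cfg"]["norm_kwargs"]["eps"]
    print(path, vision_eps, text_eps)  # expected: 1e-06 and 1e-06 for each file
```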
open_clip/transformer.py (14 changes: 6 additions & 8 deletions)

@@ -455,7 +455,6 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = LayerNorm,
             output_tokens: bool = False,
-            eps: float = 1e-5  # Add eps as a parameter
     ):
         super().__init__()
         assert pool_type in ('tok', 'avg', 'none')
@@ -488,15 +487,15 @@ def __init__(
         # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
         self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
 
-        self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width, eps=eps)
+        self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width)
         self.transformer = Transformer(
             width,
             layers,
             heads,
             mlp_ratio,
             ls_init_value=ls_init_value,
             act_layer=act_layer,
-            norm_layer=lambda x: norm_layer(x, eps=eps),
+            norm_layer=norm_layer,
         )
 
         if attentional_pool:
@@ -534,7 +533,7 @@ def __init__(
             pool_dim = width
         self.pool_type = pool_type
 
-        self.ln_post = norm_layer(pool_dim, eps=eps)
+        self.ln_post = norm_layer(pool_dim)
         self.proj = nn.Parameter(scale * torch.randn(pool_dim, output_dim))
 
         self.init_parameters()
@@ -694,7 +693,6 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = LayerNorm,
             output_tokens: bool = False,
-            eps: float = 1e-5  # Add eps as a parameter
     ):
         super().__init__()
         assert pool_type in ('first', 'last', 'argmax', 'none')
@@ -721,9 +719,9 @@
             mlp_ratio=mlp_ratio,
             ls_init_value=ls_init_value,
             act_layer=act_layer,
-            norm_layer=lambda x: norm_layer(x, eps=eps),
+            norm_layer=norm_layer,
         )
-        self.ln_final = norm_layer(width, eps=eps)
+        self.ln_final = norm_layer(width)
 
         if no_causal_mask:
             self.attn_mask = None
@@ -925,4 +923,4 @@ def forward(self, image_embs, text_embs):
 
     @torch.jit.ignore
     def set_grad_checkpointing(self, enable=True):
-        self.grad_checkpointing = enable
+        self.grad_checkpointing = enable
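As an end-to-end check that the config-driven epsilon reaches the built model, a short usage sketch; it assumes this fork is importable as open_clip with these configs registered under the same names, and that the attribute paths (visual.ln_post, ln_final) follow open_clip's standard VisionTransformer/CLIP layout:

```python
import open_clip

# Build from the repo's JSON config only; no pretrained weights are needed
# just to inspect the LayerNorm settings. Model name and attribute paths
# assume the CLIPS fork's config registration and open_clip's usual layout.
model = open_clip.create_model("ViT-L-14-CLIPS-224")

print(model.visual.ln_post.eps)  # expected 1e-06, from vision_cfg.norm_kwargs
print(model.ln_final.eps)        # expected 1e-06, from text_cfg.norm_kwargs
```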
