""" EVA

EVA ViT from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

@article{EVA,
  title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
  author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2211.07636},
  year={2022}
}

EVA-02: A Visual Representation for Neon Genesis - https://arxiv.org/abs/2303.11331

@article{EVA02,
  title={EVA-02: A Visual Representation for Neon Genesis},
  author={Fang, Yuxin and Sun, Quan and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2303.11331},
  year={2023}
}

@article{bolya2025perception,
  title={Perception encoder: The best visual embeddings are not at the output of the network},
  author={Bolya, Daniel and Huang, Po-Yao and Sun, Peize and Cho, Jang Hyun and Madotto, Andrea and Wei, Chen and Ma,
  Tengyu and Zhi, Jiale and Rajasegaran, Jathushan and Rasheed, Hanoona and others},
  journal={arXiv preprint arXiv:2504.13181},
  year={2025}
}

This file contains a number of ViT variants that utilise ROPE position embeddings, SwiGLU and other additions:
* EVA & EVA02 model implementations that evolved from BEiT, additional models in vision_transformer.py.
* `timm` original SBB ViT w/ ROPE position embeddings
* Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)

Modifications by / Copyright 2023 Ross Wightman, original copyrights below
"""
@@ -1295,30 +1306,31 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
12951306
@register_model
def eva_giant_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
    """EVA-g (giant) model @ 224x224, https://arxiv.org/abs/2211.07636

    Args:
        pretrained: Load pretrained weights if True.
        **kwargs: Extra args passed through to the Eva model constructor (may override model_args).

    Returns:
        An Eva model instance.
    """
    # mlp_ratio expressed as 6144 / 1408 to preserve the exact FFN width from the paper config.
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    # kwargs merged last so callers can override any default model arg.
    model = _create_eva('eva_giant_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
13021313
13031314
@register_model
def eva_giant_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
    """EVA-g (giant) model @ 336x336, https://arxiv.org/abs/2211.07636

    Args:
        pretrained: Load pretrained weights if True.
        **kwargs: Extra args passed through to the Eva model constructor (may override model_args).

    Returns:
        An Eva model instance.
    """
    # Same giant backbone config as the 224 variant; only the pretrained-weight target differs.
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    model = _create_eva('eva_giant_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
13101321
13111322
@register_model
def eva_giant_patch14_560(pretrained: bool = False, **kwargs) -> Eva:
    """EVA-g (giant) model @ 560x560, https://arxiv.org/abs/2211.07636

    Args:
        pretrained: Load pretrained weights if True.
        **kwargs: Extra args passed through to the Eva model constructor (may override model_args).

    Returns:
        An Eva model instance.
    """
    # Same giant backbone config as the 224/336 variants; only the pretrained-weight target differs.
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    model = _create_eva('eva_giant_patch14_560', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
13181329
13191330
13201331@register_model
13211332def eva02_tiny_patch14_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1333+ """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
13221334 model_args = dict (
13231335 img_size = 224 ,
13241336 patch_size = 14 ,
@@ -1336,6 +1348,7 @@ def eva02_tiny_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
13361348
13371349@register_model
13381350def eva02_small_patch14_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1351+ """EVA02 Small https://arxiv.org/abs/2303.11331"""
13391352 model_args = dict (
13401353 img_size = 224 ,
13411354 patch_size = 14 ,
@@ -1353,6 +1366,7 @@ def eva02_small_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
13531366
13541367@register_model
13551368def eva02_base_patch14_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1369+ """EVA02 Base https://arxiv.org/abs/2303.11331"""
13561370 model_args = dict (
13571371 img_size = 224 ,
13581372 patch_size = 14 ,
@@ -1372,6 +1386,7 @@ def eva02_base_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
13721386
13731387@register_model
13741388def eva02_large_patch14_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1389+ """EVA02 Large https://arxiv.org/abs/2303.11331"""
13751390 model_args = dict (
13761391 img_size = 224 ,
13771392 patch_size = 14 ,
@@ -1391,6 +1406,7 @@ def eva02_large_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
13911406
13921407@register_model
13931408def eva02_tiny_patch14_336 (pretrained : bool = False , ** kwargs ) -> Eva :
1409+ """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
13941410 model_args = dict (
13951411 img_size = 336 ,
13961412 patch_size = 14 ,
@@ -1408,6 +1424,7 @@ def eva02_tiny_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
14081424
14091425@register_model
14101426def eva02_small_patch14_336 (pretrained : bool = False , ** kwargs ) -> Eva :
1427+ """EVA02 Small https://arxiv.org/abs/2303.11331"""
14111428 model_args = dict (
14121429 img_size = 336 ,
14131430 patch_size = 14 ,
@@ -1425,6 +1442,7 @@ def eva02_small_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
14251442
14261443@register_model
14271444def eva02_base_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1445+ """EVA02 Base https://arxiv.org/abs/2303.11331"""
14281446 model_args = dict (
14291447 img_size = 448 ,
14301448 patch_size = 14 ,
@@ -1444,6 +1462,7 @@ def eva02_base_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
14441462
14451463@register_model
14461464def eva02_large_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1465+ """EVA02 Large https://arxiv.org/abs/2303.11331"""
14471466 model_args = dict (
14481467 img_size = 448 ,
14491468 patch_size = 14 ,
@@ -1463,7 +1482,7 @@ def eva02_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
14631482
14641483@register_model
14651484def eva_giant_patch14_clip_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1466- """ EVA-g CLIP model (only difference from non-CLIP is the pooling) """
1485+ """EVA-g CLIP model (only difference from non-CLIP is the pooling)"""
14671486 model_args = dict (
14681487 patch_size = 14 , embed_dim = 1408 , depth = 40 , num_heads = 16 , mlp_ratio = 6144 / 1408 ,
14691488 global_pool = kwargs .pop ('global_pool' , 'token' ))
@@ -1473,7 +1492,7 @@ def eva_giant_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
14731492
14741493@register_model
14751494def eva02_base_patch16_clip_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1476- """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base """
1495+ """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_base"""
14771496 model_args = dict (
14781497 img_size = 224 ,
14791498 patch_size = 16 ,
@@ -1495,7 +1514,7 @@ def eva02_base_patch16_clip_224(pretrained: bool = False, **kwargs) -> Eva:
14951514
14961515@register_model
14971516def eva02_large_patch14_clip_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1498- """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
1517+ """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
14991518 model_args = dict (
15001519 img_size = 224 ,
15011520 patch_size = 14 ,
@@ -1517,7 +1536,7 @@ def eva02_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
15171536
15181537@register_model
15191538def eva02_large_patch14_clip_336 (pretrained : bool = False , ** kwargs ) -> Eva :
1520- """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
1539+ """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
15211540 model_args = dict (
15221541 img_size = 336 ,
15231542 patch_size = 14 ,
@@ -1539,7 +1558,7 @@ def eva02_large_patch14_clip_336(pretrained: bool = False, **kwargs) -> Eva:
15391558
15401559@register_model
15411560def eva02_enormous_patch14_clip_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1542- """ A EVA-CLIP specific variant that uses residual post-norm in blocks """
1561+ """An EVA-CLIP specific variant that uses residual post-norm in blocks"""
15431562 model_args = dict (
15441563 img_size = 224 ,
15451564 patch_size = 14 ,
@@ -1556,6 +1575,7 @@ def eva02_enormous_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
15561575
15571576@register_model
15581577def vit_medium_patch16_rope_reg1_gap_256 (pretrained : bool = False , ** kwargs ) -> Eva :
1578+ """timm SBB ViT with ROPE"""
15591579 model_args = dict (
15601580 img_size = 256 ,
15611581 patch_size = 16 ,
@@ -1577,6 +1597,7 @@ def vit_medium_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) ->
15771597
15781598@register_model
15791599def vit_mediumd_patch16_rope_reg1_gap_256 (pretrained : bool = False , ** kwargs ) -> Eva :
1600+ """timm SBB ViT with ROPE"""
15801601 model_args = dict (
15811602 img_size = 256 ,
15821603 patch_size = 16 ,
@@ -1598,6 +1619,7 @@ def vit_mediumd_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) ->
15981619
15991620@register_model
16001621def vit_betwixt_patch16_rope_reg4_gap_256 (pretrained : bool = False , ** kwargs ) -> Eva :
1622+ """timm SBB ViT with ROPE"""
16011623 model_args = dict (
16021624 img_size = 256 ,
16031625 patch_size = 16 ,
@@ -1619,6 +1641,7 @@ def vit_betwixt_patch16_rope_reg4_gap_256(pretrained: bool = False, **kwargs) ->
16191641
16201642@register_model
16211643def vit_base_patch16_rope_reg1_gap_256 (pretrained : bool = False , ** kwargs ) -> Eva :
1644+ """timm SBB ViT with ROPE"""
16221645 model_args = dict (
16231646 img_size = 256 ,
16241647 patch_size = 16 ,
@@ -1640,6 +1663,7 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Ev
16401663
16411664@register_model
16421665def vit_pe_core_base_patch16_224 (pretrained : bool = False , ** kwargs ) -> Eva :
1666+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
16431667 model_args = dict (
16441668 patch_size = 16 ,
16451669 embed_dim = 768 ,
@@ -1663,6 +1687,7 @@ def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
16631687
16641688@register_model
16651689def vit_pe_core_large_patch14_336 (pretrained : bool = False , ** kwargs ) -> Eva :
1690+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
16661691 model_args = dict (
16671692 patch_size = 14 ,
16681693 embed_dim = 1024 ,
@@ -1686,6 +1711,7 @@ def vit_pe_core_large_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
16861711
16871712@register_model
16881713def vit_pe_core_gigantic_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1714+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
16891715 model_args = dict (
16901716 patch_size = 14 ,
16911717 embed_dim = 1536 ,
@@ -1709,6 +1735,7 @@ def vit_pe_core_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
17091735
17101736@register_model
17111737def vit_pe_lang_large_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1738+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
17121739 model_args = dict (
17131740 patch_size = 14 ,
17141741 embed_dim = 1024 ,
@@ -1733,6 +1760,7 @@ def vit_pe_lang_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
17331760
17341761@register_model
17351762def vit_pe_lang_gigantic_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1763+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
17361764 model_args = dict (
17371765 patch_size = 14 ,
17381766 embed_dim = 1536 ,
@@ -1756,6 +1784,7 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
17561784
17571785@register_model
17581786def vit_pe_spatial_gigantic_patch14_448 (pretrained : bool = False , ** kwargs ) -> Eva :
1787+ """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
17591788 model_args = dict (
17601789 patch_size = 14 ,
17611790 embed_dim = 1536 ,
0 commit comments