@@ -1817,6 +1817,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1841,6 +1846,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1856,6 +1871,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         input_size=(3, 256, 256),
         num_classes=0),
+    'vit_base_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
     'vit_base_patch16_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-B-16-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1910,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1944,15 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),
 
+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
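A minimal usage sketch, not part of the commit above: it shows how one of the newly added pretrained tags would resolve through the standard timm factory and data-config helpers. The model/tag name comes from this diff; create_model, resolve_data_config and create_transform are existing timm APIs, and the expected 1152-wide output follows from the SO400M embed_dim registered further down.

import torch
import timm
from timm.data import resolve_data_config, create_transform

# Tag added in this diff; weights are the converted open_clip SigLIP checkpoint on the HF hub.
model = timm.create_model('vit_so400m_patch14_siglip_378.webli', pretrained=True).eval()

# The pretrained cfg carries input_size=(3, 378, 378) and crop_pct=1.0; turn it into a transform.
data_cfg = resolve_data_config({}, model=model)
transform = create_transform(**data_cfg)

with torch.no_grad():
    out = model(torch.randn(1, 3, 378, 378))
print(out.shape)  # num_classes=0 in the cfg, so this should be the pooled embedding, e.g. torch.Size([1, 1152])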
@@ -2935,6 +2974,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
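A quick note on why the two new functions above are called "corrected" variants (my arithmetic, not text from the commit): the input resolution should be an exact multiple of the patch size so the patch grid needs no padding or truncation.

384 / 14 ≈ 27.43  -> the original 384px, patch14 SO400M model does not tile evenly
378 / 14 = 27     -> 378px gives an exact 27 x 27 patch grid
256 / 16 = 16     -> likewise, 256px with patch16 gives an exact 16 x 16 grid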
@@ -3023,6 +3084,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
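A small sketch, not part of the commit, contrasting the two pooling flavors registered above: the plain SigLIP variants use an attention-pool head (global_pool='map'), while the _gap_ variants average the patch tokens (global_pool='avg', fc_norm=False). Both model names are registered in this diff; forward_features/forward_head are existing VisionTransformer methods in timm, and the token/width numbers follow from patch_size=16 at 256px and embed_dim=1152.

import torch
import timm

x = torch.randn(1, 3, 256, 256)

# Random init keeps the sketch download-free; note both are ~400M-parameter models.
map_model = timm.create_model('vit_so400m_patch16_siglip_256', num_classes=0)
gap_model = timm.create_model('vit_so400m_patch16_siglip_gap_256', num_classes=0)

tokens = gap_model.forward_features(x)        # unpooled patch tokens, no class token: [1, 256, 1152]
pooled_avg = gap_model.forward_head(tokens)   # mean over tokens -> [1, 1152]
pooled_map = map_model.forward_head(map_model.forward_features(x))  # attention-pooled -> [1, 1152]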