@@ -1841,6 +1841,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/ViT-SO400M-14-SigLIP',
         hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=0),
+    'vit_so400m_patch16_siglip_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378),
+        num_classes=0),
     'vit_so400m_patch14_siglip_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
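Reviewer note: the new 378 tag deliberately reuses the ViT-SO400M-14-SigLIP-384 checkpoint. The 384 model truncates its patch grid (384 // 14 = 27, dropping the last 6 pixels), so a 378 = 14 * 27 input reproduces that 27 x 27 grid exactly, with no padding or truncation. A quick way to check the resolved config (a sketch, assuming this branch of timm is installed; the model/tag names come from the diff above):

import timm

m = timm.create_model('vit_so400m_patch14_siglip_378.webli', pretrained=False, num_classes=0)
print(m.pretrained_cfg['hf_hub_id'])   # timm/ViT-SO400M-14-SigLIP-384 (reused 384 weights)
print(m.pretrained_cfg['input_size'])  # (3, 378, 378)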
@@ -1890,6 +1900,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='paligemma-3b-pt-224.npz',
         custom_load='hf',
         num_classes=0),
+    'vit_so400m_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
+        hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 378, 378), crop_pct=1.0,
+        num_classes=0),
     'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
         hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
         hf_hub_filename='open_clip_pytorch_model.bin',
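Reviewer note: the _gap_ tags point at the same open_clip checkpoints as their MAP-pooled counterparts; the builders further down set global_pool='avg' and fc_norm=False, so the attention-pool head is never instantiated and its checkpoint weights are presumably just skipped at load time.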
@@ -1914,6 +1934,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),
 
+    'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_map-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+    'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
+        # hf_hub_id='timm/',
+        # file='vit_so400m_p14_378_gap-8.pth',
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
+    ),
+
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
         hf_hub_filename='open_clip_pytorch_model.bin',
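Reviewer note: the two webli_ft_in1k tags land with their hub/file entries commented out, so they register an eval config (378 x 378, crop_pct=1.0, 'squash' crop mode) but no downloadable weights yet; the hf_hub_id='timm/' placeholders presumably get uncommented once the fine-tuned checkpoints are uploaded. The architecture registrations themselves are already visible (a sketch, assuming this branch of timm is installed):

import timm

print(timm.list_models('vit_so400m*siglip*'))  # includes the new 256/378 variants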
@@ -2935,6 +2966,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # the multilingual 'i18n' SigLIP SO400M release: patch 16 at 256 x 256 (an exact 16 x 16 token grid)
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # a corrected variant of the 384 model, with a resolution (378 = 14 * 27) evenly divisible by the patch size (no padding/truncation)
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
@@ -3023,6 +3076,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
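Usage sketch (assumes this branch of timm is installed; the model names come from the diff above, everything else is standard timm API). The odd-looking mlp_ratio=3.7362 just encodes the shape-optimized SoViT-400M MLP width as a ratio of the 1152 embed dim: int(1152 * 3.7362) == 4304.

import timm
import torch

model = timm.create_model('vit_so400m_patch14_siglip_378', pretrained=False, num_classes=0)
model.eval()

x = torch.randn(1, 3, 378, 378)  # 378 = 14 * 27, so a 27 * 27 = 729-token patch grid
with torch.no_grad():
    feats = model(x)
print(feats.shape)  # torch.Size([1, 1152]) pooled SigLIP image features

# the MLP hidden width implied by the ratio
assert int(1152 * 3.7362) == 4304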