Skip to content

Commit e966314

Browse files
merveenoyan, pcuenca, Vaibhavs10
authored
[tasks] update models, datasets, Spaces (#1686)
Co-authored-by: Pedro Cuenca <[email protected]>
Co-authored-by: vb <[email protected]>
1 parent 243db2b commit e966314

File tree

14 files changed

+91
-146
lines changed

14 files changed

+91
-146
lines changed

packages/tasks/src/tasks/any-to-any/data.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -40,21 +40,21 @@ const taskData: TaskDataCustom = {
4040
},
4141
{
4242
description: "Robust model that can take in image and text and generate image and text.",
43-
id: "deepseek-ai/Janus-Pro-7B",
43+
id: "OmniGen2/OmniGen2",
4444
},
4545
{
4646
description: "Any-to-any model with speech, video, audio, image and text understanding capabilities.",
4747
id: "openbmb/MiniCPM-o-2_6",
4848
},
4949
{
5050
description: "A model that can understand image and text and generate image and text.",
51-
id: "EPFL-VILAB/4M-21_XL",
51+
id: "ByteDance-Seed/BAGEL-7B-MoT",
5252
},
5353
],
5454
spaces: [
5555
{
5656
description: "An application to chat with an any-to-any (image & text) model.",
57-
id: "deepseek-ai/Janus-Pro-7B",
57+
id: "OmniGen2/OmniGen2",
5858
},
5959
],
6060
summary: "Any-to-any models can understand two or more modalities and output two or more modalities.",

packages/tasks/src/tasks/image-segmentation/data.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,8 @@ const taskData: TaskDataCustom = {
4444
models: [
4545
{
4646
// TO DO: write description
47-
description: "Solid semantic segmentation model trained on ADE20k.",
48-
id: "openmmlab/upernet-convnext-small",
47+
description: "Solid panoptic segmentation model trained on COCO.",
48+
id: "tue-mps/coco_panoptic_eomt_large_640",
4949
},
5050
{
5151
description: "Background removal model.",

packages/tasks/src/tasks/image-text-to-text/data.ts

Lines changed: 10 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -47,67 +47,39 @@ const taskData: TaskDataCustom = {
4747
id: "HuggingFaceTB/SmolVLM-Instruct",
4848
},
4949
{
50-
description: "A screenshot understanding model used to control computers.",
51-
id: "microsoft/OmniParser-v2.0",
50+
description: "Cutting-edge reasoning vision language model.",
51+
id: "zai-org/GLM-4.5V",
5252
},
5353
{
54-
description: "Cutting-edge vision language model.",
55-
id: "allenai/Molmo-7B-D-0924",
54+
description: "Cutting-edge small vision language model to convert documents to text.",
55+
id: "rednote-hilab/dots.ocr",
5656
},
5757
{
5858
description: "Small yet powerful model.",
59-
id: "vikhyatk/moondream2",
60-
},
61-
{
62-
description: "Strong image-text-to-text model.",
63-
id: "Qwen/Qwen2.5-VL-7B-Instruct",
59+
id: "Qwen/Qwen2.5-VL-3B-Instruct",
6460
},
6561
{
6662
description: "Image-text-to-text model with agentic capabilities.",
6763
id: "microsoft/Magma-8B",
6864
},
69-
{
70-
description: "Strong image-text-to-text model focused on documents.",
71-
id: "allenai/olmOCR-7B-0225-preview",
72-
},
73-
{
74-
description: "Small yet strong image-text-to-text model.",
75-
id: "ibm-granite/granite-vision-3.2-2b",
76-
},
7765
],
7866
spaces: [
7967
{
8068
description: "Leaderboard to evaluate vision language models.",
8169
id: "opencompass/open_vlm_leaderboard",
8270
},
8371
{
84-
description: "Vision language models arena, where models are ranked by votes of users.",
85-
id: "WildVision/vision-arena",
86-
},
87-
{
88-
description: "Powerful vision-language model assistant.",
89-
id: "akhaliq/Molmo-7B-D-0924",
90-
},
91-
{
92-
description: "Powerful vision language assistant that can understand multiple images.",
93-
id: "HuggingFaceTB/SmolVLM2",
94-
},
95-
{
96-
description: "An application for chatting with an image-text-to-text model.",
97-
id: "GanymedeNil/Qwen2-VL-7B",
98-
},
99-
{
100-
description: "An application that parses screenshots into actions.",
101-
id: "showlab/ShowUI",
72+
description: "An application that compares object detection capabilities of different vision language models.",
73+
id: "sergiopaniego/vlm_object_understanding",
10274
},
10375
{
104-
description: "An application that detects gaze.",
105-
id: "moondream/gaze-demo",
76+
description: "An application to compare different OCR models.",
77+
id: "prithivMLmods/Multimodal-OCR",
10678
},
10779
],
10880
summary:
10981
"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
110-
widgetModels: ["Qwen/Qwen2-VL-7B-Instruct"],
82+
widgetModels: ["zai-org/GLM-4.5V"],
11183
youtubeId: "IoGaGfU1CIg",
11284
};
11385

packages/tasks/src/tasks/image-to-3d/data.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ const taskData: TaskDataCustom = {
3333
id: "TencentARC/InstantMesh",
3434
},
3535
{
36-
description: "Fast image-to-3D mesh model by StabilityAI",
37-
id: "stabilityai/TripoSR",
36+
description: "3D world generation model.",
37+
id: "tencent/HunyuanWorld-1",
3838
},
3939
{
4040
description: "A scaled up image-to-3D mesh model derived from TripoSR.",

packages/tasks/src/tasks/image-to-image/data.ts

Lines changed: 13 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -53,16 +53,16 @@ const taskData: TaskDataCustom = {
5353
id: "fal/AuraSR-v2",
5454
},
5555
{
56-
description: "A model that increases the resolution of an image.",
57-
id: "keras-io/super-resolution",
56+
description: "Powerful image editing model.",
57+
id: "black-forest-labs/FLUX.1-Kontext-dev",
5858
},
5959
{
60-
description: "A model for applying edits to images through image controls.",
61-
id: "Yuanshi/OminiControl",
60+
description: "Virtual try-on model.",
61+
id: "yisol/IDM-VTON",
6262
},
6363
{
64-
description: "A model that generates images based on segments in the input image and the text prompt.",
65-
id: "mfidabel/controlnet-segment-anything",
64+
description: "Image re-lighting model.",
65+
id: "kontext-community/relighting-kontext-dev-lora-v3",
6666
},
6767
{
6868
description: "Strong model for inpainting and outpainting.",
@@ -75,33 +75,21 @@ const taskData: TaskDataCustom = {
7575
],
7676
spaces: [
7777
{
78-
description: "Image enhancer application for low light.",
79-
id: "keras-io/low-light-image-enhancement",
78+
description: "Image editing application.",
79+
id: "black-forest-labs/FLUX.1-Kontext-Dev",
8080
},
8181
{
82-
description: "Style transfer application.",
83-
id: "keras-io/neural-style-transfer",
82+
description: "Image relighting application.",
83+
id: "lllyasviel/iclight-v2-vary",
8484
},
8585
{
86-
description: "An application that generates images based on segment control.",
87-
id: "mfidabel/controlnet-segment-anything",
88-
},
89-
{
90-
description: "Image generation application that takes image control and text prompt.",
91-
id: "hysts/ControlNet",
92-
},
93-
{
94-
description: "Colorize any image using this app.",
95-
id: "ioclab/brightness-controlnet",
96-
},
97-
{
98-
description: "Edit images with instructions.",
99-
id: "timbrooks/instruct-pix2pix",
86+
description: "An application for image upscaling.",
87+
id: "jasperai/Flux.1-dev-Controlnet-Upscaler",
10088
},
10189
],
10290
summary:
10391
"Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
104-
widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
92+
widgetModels: ["Qwen/Qwen-Image"],
10593
youtubeId: "",
10694
};
10795

packages/tasks/src/tasks/image-to-text/data.ts

Lines changed: 10 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -31,46 +31,26 @@ const taskData: TaskDataCustom = {
3131
metrics: [],
3232
models: [
3333
{
34-
description: "A robust image captioning model.",
35-
id: "Salesforce/blip2-opt-2.7b",
34+
description: "Strong OCR model.",
35+
id: "allenai/olmOCR-7B-0725",
3636
},
3737
{
38-
description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
39-
id: "microsoft/kosmos-2-patch14-224",
40-
},
41-
{
42-
description: "A strong optical character recognition model.",
43-
id: "facebook/nougat-base",
44-
},
45-
{
46-
description: "A powerful model that lets you have a conversation with the image.",
47-
id: "llava-hf/llava-1.5-7b-hf",
38+
description: "Powerful image captioning model.",
39+
id: "fancyfeast/llama-joycaption-beta-one-hf-llava",
4840
},
4941
],
5042
spaces: [
5143
{
52-
description: "An application that compares various image captioning models.",
53-
id: "nielsr/comparing-captioning-models",
54-
},
55-
{
56-
description: "A robust image captioning application.",
57-
id: "flax-community/image-captioning",
58-
},
59-
{
60-
description: "An application that transcribes handwritings into text.",
61-
id: "nielsr/TrOCR-handwritten",
62-
},
63-
{
64-
description: "An application that can caption images and answer questions about a given image.",
65-
id: "Salesforce/BLIP",
44+
description: "SVG generator app from images.",
45+
id: "multimodalart/OmniSVG-3B",
6646
},
6747
{
68-
description: "An application that can caption images and answer questions with a conversational agent.",
69-
id: "Salesforce/BLIP2",
48+
description: "An application that converts documents to markdown.",
49+
id: "numind/NuMarkdown-8B-Thinking",
7050
},
7151
{
72-
description: "An image captioning application that demonstrates the effect of noise on captions.",
73-
id: "johko/capdec-image-captioning",
52+
description: "An application that can caption images.",
53+
id: "fancyfeast/joy-caption-beta-one",
7454
},
7555
],
7656
summary:

packages/tasks/src/tasks/keypoint-detection/data.ts

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,11 +33,11 @@ const taskData: TaskDataCustom = {
3333
},
3434
{
3535
description: "Strong keypoint detection model used to detect human pose.",
36-
id: "facebook/sapiens-pose-1b",
36+
id: "qualcomm/RTMPose-Body2d",
3737
},
3838
{
39-
description: "Powerful keypoint detection model used to detect human pose.",
40-
id: "usyd-community/vitpose-plus-base",
39+
description: "Powerful keypoint matching model.",
40+
id: "ETH-CVG/lightglue_disk",
4141
},
4242
],
4343
spaces: [
@@ -46,8 +46,8 @@ const taskData: TaskDataCustom = {
4646
id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
4747
},
4848
{
49-
description: "An application to try a universal keypoint detection model.",
50-
id: "merve/SuperPoint",
49+
description: "An application for keypoint detection and matching.",
50+
id: "ETH-CVG/LightGlue",
5151
},
5252
],
5353
summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",

packages/tasks/src/tasks/object-detection/data.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
6161
],
6262
spaces: [
6363
{
64-
description: "Leaderboard to compare various object detection models across several metrics.",
65-
id: "hf-vision/object_detection_leaderboard",
64+
description: "Real-time object detection demo.",
65+
id: "Roboflow/RF-DETR",
6666
},
6767
{
6868
description: "An application that contains various object detection models to try from.",

packages/tasks/src/tasks/text-generation/data.ts

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -63,20 +63,20 @@ const taskData: TaskDataCustom = {
6363
models: [
6464
{ description: "A text-generation model trained to follow instructions.", id: "google/gemma-2-2b-it" },
6565
{
66-
description: "Smaller variant of one of the most powerful models.",
67-
id: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
66+
description: "Powerful text generation model for coding.",
67+
id: "Qwen/Qwen3-Coder-480B-A35B-Instruct",
6868
},
6969
{
70-
description: "Very powerful text generation model trained to follow instructions.",
71-
id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
70+
description: "Great text generation model with top-notch tool calling capabilities.",
71+
id: "openai/gpt-oss-120b",
7272
},
7373
{
74-
description: "Powerful text generation model by Microsoft.",
75-
id: "microsoft/phi-4",
74+
description: "Powerful text generation model.",
75+
id: "zai-org/GLM-4.5",
7676
},
7777
{
78-
description: "A very powerful model with reasoning capabilities.",
79-
id: "simplescaling/s1.1-32B",
78+
description: "A powerful small model with reasoning capabilities.",
79+
id: "Qwen/Qwen3-4B-Thinking-2507",
8080
},
8181
{
8282
description: "Strong conversational model that supports very long instructions.",
@@ -93,8 +93,12 @@ const taskData: TaskDataCustom = {
9393
],
9494
spaces: [
9595
{
96-
description: "A leaderboard to compare different open-source text generation models based on various benchmarks.",
97-
id: "open-llm-leaderboard/open_llm_leaderboard",
96+
description: "An application that writes and executes code from text instructions and supports many models.",
97+
id: "akhaliq/anycoder",
98+
},
99+
{
100+
description: "An application that builds websites from natural language prompts.",
101+
id: "enzostvs/deepsite",
98102
},
99103
{
100104
description: "A leaderboard for comparing chain-of-thought performance of models.",

packages/tasks/src/tasks/text-to-image/data.ts

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -50,19 +50,19 @@ const taskData: TaskDataCustom = {
5050
models: [
5151
{
5252
description: "One of the most powerful image generation models that can generate realistic outputs.",
53-
id: "black-forest-labs/FLUX.1-dev",
53+
id: "black-forest-labs/FLUX.1-Krea-dev",
5454
},
5555
{
56-
description: "A powerful yet fast image generation model.",
57-
id: "latent-consistency/lcm-lora-sdxl",
56+
description: "A powerful image generation model.",
57+
id: "Qwen/Qwen-Image",
5858
},
5959
{
60-
description: "Text-to-image model for photorealistic generation.",
61-
id: "Kwai-Kolors/Kolors",
60+
description: "Powerful and fast image generation model.",
61+
id: "ByteDance/SDXL-Lightning",
6262
},
6363
{
6464
description: "A powerful text-to-image model.",
65-
id: "stabilityai/stable-diffusion-3-medium-diffusers",
65+
id: "ByteDance/Hyper-SD",
6666
},
6767
],
6868
spaces: [

0 commit comments

Comments
 (0)