[tasks] update models, datasets, Spaces (#1686)

merveenoyan · pcuenca · Vaibhavs10 · web-flow · commit e966314257ad · 2025-08-15T18:14:28.000+02:00
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: vb <vaibhavs10@gmail.com>
diff --git a/packages/tasks/src/tasks/any-to-any/data.ts b/packages/tasks/src/tasks/any-to-any/data.ts
@@ -40,21 +40,21 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Robust model that can take in image and text and generate image and text.",
-			id: "deepseek-ai/Janus-Pro-7B",
+			id: "OmniGen2/OmniGen2",
 		},
 		{
 			description: "Any-to-any model with speech, video, audio, image and text understanding capabilities.",
 			id: "openbmb/MiniCPM-o-2_6",
 		},
 		{
 			description: "A model that can understand image and text and generate image and text.",
-			id: "EPFL-VILAB/4M-21_XL",
+			id: "ByteDance-Seed/BAGEL-7B-MoT",
 		},
 	],
 	spaces: [
 		{
 			description: "An application to chat with an any-to-any (image & text) model.",
-			id: "deepseek-ai/Janus-Pro-7B",
+			id: "OmniGen2/OmniGen2",
 		},
 	],
 	summary: "Any-to-any models can understand two or more modalities and output two or more modalities.",
diff --git a/packages/tasks/src/tasks/image-segmentation/data.ts b/packages/tasks/src/tasks/image-segmentation/data.ts
@@ -44,8 +44,8 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			// TO DO: write description
-			description: "Solid semantic segmentation model trained on ADE20k.",
-			id: "openmmlab/upernet-convnext-small",
+			description: "Solid panoptic segmentation model trained on COCO.",
+			id: "tue-mps/coco_panoptic_eomt_large_640",
 		},
 		{
 			description: "Background removal model.",
diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -47,67 +47,39 @@ const taskData: TaskDataCustom = {
 			id: "HuggingFaceTB/SmolVLM-Instruct",
 		},
 		{
-			description: "A screenshot understanding model used to control computers.",
-			id: "microsoft/OmniParser-v2.0",
+			description: "Cutting-edge reasoning vision language model.",
+			id: "zai-org/GLM-4.5V",
 		},
 		{
-			description: "Cutting-edge vision language model.",
-			id: "allenai/Molmo-7B-D-0924",
+			description: "Cutting-edge small vision language model to convert documents to text.",
+			id: "rednote-hilab/dots.ocr",
 		},
 		{
 			description: "Small yet powerful model.",
-			id: "vikhyatk/moondream2",
-		},
-		{
-			description: "Strong image-text-to-text model.",
-			id: "Qwen/Qwen2.5-VL-7B-Instruct",
+			id: "Qwen/Qwen2.5-VL-3B-Instruct",
 		},
 		{
 			description: "Image-text-to-text model with agentic capabilities.",
 			id: "microsoft/Magma-8B",
 		},
-		{
-			description: "Strong image-text-to-text model focused on documents.",
-			id: "allenai/olmOCR-7B-0225-preview",
-		},
-		{
-			description: "Small yet strong image-text-to-text model.",
-			id: "ibm-granite/granite-vision-3.2-2b",
-		},
 	],
 	spaces: [
 		{
 			description: "Leaderboard to evaluate vision language models.",
 			id: "opencompass/open_vlm_leaderboard",
 		},
 		{
-			description: "Vision language models arena, where models are ranked by votes of users.",
-			id: "WildVision/vision-arena",
-		},
-		{
-			description: "Powerful vision-language model assistant.",
-			id: "akhaliq/Molmo-7B-D-0924",
-		},
-		{
-			description: "Powerful vision language assistant that can understand multiple images.",
-			id: "HuggingFaceTB/SmolVLM2",
-		},
-		{
-			description: "An application for chatting with an image-text-to-text model.",
-			id: "GanymedeNil/Qwen2-VL-7B",
-		},
-		{
-			description: "An application that parses screenshots into actions.",
-			id: "showlab/ShowUI",
+			description: "An application that compares object detection capabilities of different vision language models.",
+			id: "sergiopaniego/vlm_object_understanding",
 		},
 		{
-			description: "An application that detects gaze.",
-			id: "moondream/gaze-demo",
+			description: "An application to compare different OCR models.",
+			id: "prithivMLmods/Multimodal-OCR",
 		},
 	],
 	summary:
 		"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
-	widgetModels: ["Qwen/Qwen2-VL-7B-Instruct"],
+	widgetModels: ["zai-org/GLM-4.5V"],
 	youtubeId: "IoGaGfU1CIg",
 };
 
diff --git a/packages/tasks/src/tasks/image-to-3d/data.ts b/packages/tasks/src/tasks/image-to-3d/data.ts
@@ -33,8 +33,8 @@ const taskData: TaskDataCustom = {
 			id: "TencentARC/InstantMesh",
 		},
 		{
-			description: "Fast image-to-3D mesh model by StabilityAI",
-			id: "stabilityai/TripoSR",
+			description: "3D world generation model.",
+			id: "tencent/HunyuanWorld-1",
 		},
 		{
 			description: "A scaled up image-to-3D mesh model derived from TripoSR.",
diff --git a/packages/tasks/src/tasks/image-to-image/data.ts b/packages/tasks/src/tasks/image-to-image/data.ts
@@ -53,16 +53,16 @@ const taskData: TaskDataCustom = {
 			id: "fal/AuraSR-v2",
 		},
 		{
-			description: "A model that increases the resolution of an image.",
-			id: "keras-io/super-resolution",
+			description: "Powerful image editing model.",
+			id: "black-forest-labs/FLUX.1-Kontext-dev",
 		},
 		{
-			description: "A model for applying edits to images through image controls.",
-			id: "Yuanshi/OminiControl",
+			description: "Virtual try-on model.",
+			id: "yisol/IDM-VTON",
 		},
 		{
-			description: "A model that generates images based on segments in the input image and the text prompt.",
-			id: "mfidabel/controlnet-segment-anything",
+			description: "Image re-lighting model.",
+			id: "kontext-community/relighting-kontext-dev-lora-v3",
 		},
 		{
 			description: "Strong model for inpainting and outpainting.",
@@ -75,33 +75,21 @@ const taskData: TaskDataCustom = {
 	],
 	spaces: [
 		{
-			description: "Image enhancer application for low light.",
-			id: "keras-io/low-light-image-enhancement",
+			description: "Image editing application.",
+			id: "black-forest-labs/FLUX.1-Kontext-Dev",
 		},
 		{
-			description: "Style transfer application.",
-			id: "keras-io/neural-style-transfer",
+			description: "Image relighting application.",
+			id: "lllyasviel/iclight-v2-vary",
 		},
 		{
-			description: "An application that generates images based on segment control.",
-			id: "mfidabel/controlnet-segment-anything",
-		},
-		{
-			description: "Image generation application that takes image control and text prompt.",
-			id: "hysts/ControlNet",
-		},
-		{
-			description: "Colorize any image using this app.",
-			id: "ioclab/brightness-controlnet",
-		},
-		{
-			description: "Edit images with instructions.",
-			id: "timbrooks/instruct-pix2pix",
+			description: "An application for image upscaling.",
+			id: "jasperai/Flux.1-dev-Controlnet-Upscaler",
 		},
 	],
 	summary:
 		"Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
-	widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
+	widgetModels: ["Qwen/Qwen-Image"],
 	youtubeId: "",
 };
 
diff --git a/packages/tasks/src/tasks/image-to-text/data.ts b/packages/tasks/src/tasks/image-to-text/data.ts
@@ -31,46 +31,26 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			description: "A robust image captioning model.",
-			id: "Salesforce/blip2-opt-2.7b",
+			description: "Strong OCR model.",
+			id: "allenai/olmOCR-7B-0725",
 		},
 		{
-			description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
-			id: "microsoft/kosmos-2-patch14-224",
-		},
-		{
-			description: "A strong optical character recognition model.",
-			id: "facebook/nougat-base",
-		},
-		{
-			description: "A powerful model that lets you have a conversation with the image.",
-			id: "llava-hf/llava-1.5-7b-hf",
+			description: "Powerful image captioning model.",
+			id: "fancyfeast/llama-joycaption-beta-one-hf-llava",
 		},
 	],
 	spaces: [
 		{
-			description: "An application that compares various image captioning models.",
-			id: "nielsr/comparing-captioning-models",
-		},
-		{
-			description: "A robust image captioning application.",
-			id: "flax-community/image-captioning",
-		},
-		{
-			description: "An application that transcribes handwritings into text.",
-			id: "nielsr/TrOCR-handwritten",
-		},
-		{
-			description: "An application that can caption images and answer questions about a given image.",
-			id: "Salesforce/BLIP",
+			description: "SVG generator app from images.",
+			id: "multimodalart/OmniSVG-3B",
 		},
 		{
-			description: "An application that can caption images and answer questions with a conversational agent.",
-			id: "Salesforce/BLIP2",
+			description: "An application that converts documents to markdown.",
+			id: "numind/NuMarkdown-8B-Thinking",
 		},
 		{
-			description: "An image captioning application that demonstrates the effect of noise on captions.",
-			id: "johko/capdec-image-captioning",
+			description: "An application that can caption images.",
+			id: "fancyfeast/joy-caption-beta-one",
 		},
 	],
 	summary:
diff --git a/packages/tasks/src/tasks/keypoint-detection/data.ts b/packages/tasks/src/tasks/keypoint-detection/data.ts
@@ -33,11 +33,11 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
-			id: "facebook/sapiens-pose-1b",
+			id: "qualcomm/RTMPose-Body2d",
 		},
 		{
-			description: "Powerful keypoint detection model used to detect human pose.",
-			id: "usyd-community/vitpose-plus-base",
+			description: "Powerful keypoint matching model.",
+			id: "ETH-CVG/lightglue_disk",
 		},
 	],
 	spaces: [
@@ -46,8 +46,8 @@ const taskData: TaskDataCustom = {
 			id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
 		},
 		{
-			description: "An application to try a universal keypoint detection model.",
-			id: "merve/SuperPoint",
+			description: "An application for keypoint detection and matching.",
+			id: "ETH-CVG/LightGlue",
 		},
 	],
 	summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
diff --git a/packages/tasks/src/tasks/object-detection/data.ts b/packages/tasks/src/tasks/object-detection/data.ts
@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
 	],
 	spaces: [
 		{
-			description: "Leaderboard to compare various object detection models across several metrics.",
-			id: "hf-vision/object_detection_leaderboard",
+			description: "Real-time object detection demo.",
+			id: "Roboflow/RF-DETR",
 		},
 		{
 			description: "An application that contains various object detection models to try from.",
diff --git a/packages/tasks/src/tasks/text-generation/data.ts b/packages/tasks/src/tasks/text-generation/data.ts
@@ -63,20 +63,20 @@ const taskData: TaskDataCustom = {
 	models: [
 		{ description: "A text-generation model trained to follow instructions.", id: "google/gemma-2-2b-it" },
 		{
-			description: "Smaller variant of one of the most powerful models.",
-			id: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+			description: "Powerful text generation model for coding.",
+			id: "Qwen/Qwen3-Coder-480B-A35B-Instruct",
 		},
 		{
-			description: "Very powerful text generation model trained to follow instructions.",
-			id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
+			description: "Great text generation model with top-notch tool calling capabilities.",
+			id: "openai/gpt-oss-120b",
 		},
 		{
-			description: "Powerful text generation model by Microsoft.",
-			id: "microsoft/phi-4",
+			description: "Powerful text generation model.",
+			id: "zai-org/GLM-4.5",
 		},
 		{
-			description: "A very powerful model with reasoning capabilities.",
-			id: "simplescaling/s1.1-32B",
+			description: "A powerful small model with reasoning capabilities.",
+			id: "Qwen/Qwen3-4B-Thinking-2507",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
@@ -93,8 +93,12 @@ const taskData: TaskDataCustom = {
 	],
 	spaces: [
 		{
-			description: "A leaderboard to compare different open-source text generation models based on various benchmarks.",
-			id: "open-llm-leaderboard/open_llm_leaderboard",
+			description: "An application that writes and executes code from text instructions and supports many models.",
+			id: "akhaliq/anycoder",
+		},
+		{
+			description: "An application that builds websites from natural language prompts.",
+			id: "enzostvs/deepsite",
 		},
 		{
 			description: "A leaderboard for comparing chain-of-thought performance of models.",
diff --git a/packages/tasks/src/tasks/text-to-image/data.ts b/packages/tasks/src/tasks/text-to-image/data.ts
@@ -50,19 +50,19 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "One of the most powerful image generation models that can generate realistic outputs.",
-			id: "black-forest-labs/FLUX.1-dev",
+			id: "black-forest-labs/FLUX.1-Krea-dev",
 		},
 		{
-			description: "A powerful yet fast image generation model.",
-			id: "latent-consistency/lcm-lora-sdxl",
+			description: "A powerful image generation model.",
+			id: "Qwen/Qwen-Image",
 		},
 		{
-			description: "Text-to-image model for photorealistic generation.",
-			id: "Kwai-Kolors/Kolors",
+			description: "Powerful and fast image generation model.",
+			id: "ByteDance/SDXL-Lightning",
 		},
 		{
 			description: "A powerful text-to-image model.",
-			id: "stabilityai/stable-diffusion-3-medium-diffusers",
+			id: "ByteDance/Hyper-SD",
 		},
 	],
 	spaces: [
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
diff --git a/packages/tasks/src/tasks/text-to-video/data.ts b/packages/tasks/src/tasks/text-to-video/data.ts
diff --git a/packages/tasks/src/tasks/visual-document-retrieval/data.ts b/packages/tasks/src/tasks/visual-document-retrieval/data.ts
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/data.ts b/packages/tasks/src/tasks/zero-shot-object-detection/data.ts

Original file line number	Diff line number	Diff line change
`@@ -44,8 +44,8 @@ const taskData: TaskDataCustom = {`
`44`	`44`	`models: [`
`45`	`45`	`{`
`46`	`46`	`// TO DO: write description`
`47`		`- description: "Solid semantic segmentation model trained on ADE20k.",`
`48`		`- id: "openmmlab/upernet-convnext-small",`
	`47`	`+ description: "Solid panoptic segmentation model trained on COCO.",`
	`48`	`+ id: "tue-mps/coco_panoptic_eomt_large_640",`
`49`	`49`	`},`
`50`	`50`	`{`
`51`	`51`	`description: "Background removal model.",`
Original file line number	Diff line number	Diff line change
`@@ -33,8 +33,8 @@ const taskData: TaskDataCustom = {`
`33`	`33`	`id: "TencentARC/InstantMesh",`
`34`	`34`	`},`
`35`	`35`	`{`
`36`		`- description: "Fast image-to-3D mesh model by StabilityAI",`
`37`		`- id: "stabilityai/TripoSR",`
	`36`	`+ description: "3D world generation model.",`
	`37`	`+ id: "tencent/HunyuanWorld-1",`
`38`	`38`	`},`
`39`	`39`	`{`
`40`	`40`	`description: "A scaled up image-to-3D mesh model derived from TripoSR.",`
Original file line number	Diff line number	Diff line change
`@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {`
`61`	`61`	`],`
`62`	`62`	`spaces: [`
`63`	`63`	`{`
`64`		`- description: "Leaderboard to compare various object detection models across several metrics.",`
`65`		`- id: "hf-vision/object_detection_leaderboard",`
	`64`	`+ description: "Real-time object detection demo.",`
	`65`	`+ id: "Roboflow/RF-DETR",`
`66`	`66`	`},`
`67`	`67`	`{`
`68`	`68`	`description: "An application that contains various object detection models to try from.",`