diff --git a/README.md b/README.md
index fe1d7fd..42f2e41 100644
--- a/README.md
+++ b/README.md
@@ -161,6 +161,26 @@ bash evaluate.sh
Results will be generated in `run_logs/` with detailed metrics and analysis.
+## 🖥️ Configuration UI Tool
+
+AU-Harness includes a web-based configuration UI tool to help users easily create and customize evaluation configurations without manually editing YAML files.
+
+### Features:
+- Interactive task selection from all supported categories (Speech Recognition, Paralinguistics, Audio Understanding, etc.)
+- Model configuration with preset templates for common models like GPT-4o-mini and Gemini
+- Advanced options for filtering, judge settings, and generation parameters
+- Copy to clipboard or download functionality
+
+### Usage:
+1. Navigate to the `ui/` directory
+2. Open `index.html` in your web browser
+3. Select the tasks you want to evaluate from the categorized task list
+4. Configure your models by adding model endpoints, API keys, and parameters
+5. Adjust advanced options like sample limits, language filters, and judge settings
+6. Generate the YAML configuration, then copy or download it for use with `evaluate.sh`
+
+This tool simplifies the process of setting up complex evaluation runs by providing a user-friendly interface to build `config.yaml` files, making it easier to get started with AU-Harness evaluations.
+
## 💻 Usage
AU-Harness requires setting up a running configuration file (`config.yaml`) to define your evaluation parameters. This file controls which models, datasets, and metrics are used in your evaluation.
diff --git a/ui/README.md b/ui/README.md
new file mode 100644
index 0000000..24c8d32
--- /dev/null
+++ b/ui/README.md
@@ -0,0 +1,135 @@
+# AU-Harness UI Tool
+
+A user-friendly web interface for configuring and running audio model evaluations with the AU-Harness framework.
+
+## 🚀 Quick Start
+
+1. **Open the UI**: Simply open `index.html` in your web browser
+2. **Select Tasks**: Browse task categories and select specific tasks with their metrics
+3. **Configure Models**: Choose from preset models or configure custom endpoints
+4. **Generate Config**: Preview and download the generated YAML configuration
+
+## 📋 Features
+
+### Task Selection
+- **Visual Category Navigation**: 6 task categories with clear descriptions
+- **Smart Metric Filtering**: Automatically shows supported metrics for each task
+- **Multi-Selection Support**: Select multiple tasks across different categories
+- **Real-time Feedback**: Visual indicators show selected tasks and metrics
+
+### Model Configuration
+- **Preset Models**: Quick setup for common models (GPT-4o-mini, Gemini 2.5 Flash, Qwen 2.5 Omni)
+- **Custom Model Support**: Add any OpenAI-compatible endpoint
+- **Sharding Configuration**: Automatic model instance management
+- **Connection Validation**: Built-in endpoint testing
+
+### Advanced Options
+- **Dataset Filtering**: Control sample limits, duration ranges, and language
+- **Judge Settings**: Configure LLM judges for evaluation
+- **Generation Parameters**: Override model parameters per task
+- **Prompt Customization**: Modify system and user prompts
+
+### Configuration Management
+- **YAML Preview**: See generated configuration
+- **Export Options**: Download as YAML file or copy to clipboard
+
+## 🛠️ Technical Details
+
+### Architecture
+- **Frontend**: Vanilla HTML5, CSS3, JavaScript (ES6+)
+- **No Dependencies**: Completely self-contained, no npm packages required
+- **Responsive Design**: Works on desktop, tablet, and mobile
+- **Modern CSS**: CSS Grid, Flexbox, Custom Properties
+- **Accessibility**: WCAG 2.1 compliant with semantic HTML
+
+### File Structure
+```
+ui/
+├── index.html # Main application page with HTML comments for sections
+├── styles.css # Complete styling with CSS custom properties and section comments
+├── app.js # Application logic with detailed function comments
+├── generate_tasks.py # Script to generate tasks.js and tasks.json with docstrings and comments
+├── tasks.js # Generated task categories and metrics, loaded by the page via a script tag (defines window.TASKS_DATA)
+├── tasks.json # The same task categories and metrics data as plain JSON
+└── README.md # This documentation
+```
+
+### Browser Support
+- Chrome 90+
+- Firefox 88+
+- Safari 14+
+- Edge 90+
+
+## 📖 Usage Guide
+
+### 1. Selecting Tasks
+1. Click on any category card to expand it
+2. Check the boxes next to desired tasks
+3. View selected metrics in the "Selected Tasks" section
+4. Remove tasks by clicking the "Remove" button
+
+### 2. Configuring Models
+1. Choose "Preset Models" for quick setup
+2. Check boxes next to desired models
+3. Or switch to "Custom Model" tab for custom endpoints
+4. Fill in model name, endpoint, and API key
+
+### 3. Advanced Configuration
+1. Set sample limits to control evaluation size
+2. Adjust duration filters for audio length constraints
+3. Select target language for evaluation
+4. Configure additional options as needed
+
+### 4. Generating Configuration
+1. Click "Generate Config" to create YAML
+2. Review the generated configuration in the preview
+3. Click "Download YAML" to save the file
+4. Use the config with AU-Harness evaluation engine
+
+### 5. Running Evaluations
+1. Click "Run Evaluation" to start the process
+2. Monitor progress in the Results Dashboard
+3. View scores and metrics as they complete
+4. Export results for further analysis
+
+
+## 🚀 Integration with AU-Harness
+
+The generated YAML configuration is fully compatible with the AU-Harness evaluation engine. Use it as follows:
+
+```bash
+# Using the generated config
+python evaluate.py --config your-config.yaml
+
+# Or with the file downloaded from the UI (saved as au-harness-config-YYYYMMDD_HHMMSS.yaml)
+python evaluate.py --config au-harness-config-20250101_120000.yaml
+```
+
+
+## 🆘 Troubleshooting
+
+### Common Issues
+
+**Q: Configuration preview is empty**
+A: Make sure you've selected at least one task and one model before generating the config.
+
+**Q: Download doesn't work**
+A: Check your browser's download settings and ensure pop-ups are allowed for this site.
+
+**Q: Styling looks broken**
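+A: Make sure `styles.css`, `app.js`, and `tasks.js` are in the same folder as `index.html`. If your browser restricts pages opened from a `file://` path, serve the `ui/` directory with a local web server (for example `python -m http.server`) and open the page over `http://localhost` instead.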
+
+### Performance Tips
+
+- For large evaluations, consider reducing sample limits initially
+- Use preset models for faster setup
+- Clear browser cache if experiencing issues with updates
+
+## 📞 Support
+
+For issues with the UI tool, please check:
+1. Browser console for JavaScript errors
+2. Network tab for any failed resource loads
+3. This documentation for usage guidance
+
+For issues with the AU-Harness framework itself, please refer to the main project documentation.
diff --git a/ui/app.js b/ui/app.js
new file mode 100644
index 0000000..2abcf93
--- /dev/null
+++ b/ui/app.js
@@ -0,0 +1,965 @@
+// AU-Harness Configuration UI - Main Application Logic
+// This file handles the dynamic loading of tasks, model configuration,
+// and YAML generation for the AU-Harness evaluation framework.
+
+// Task categories and tasks data
+let taskCategories = {};
+let taskConfigs = {};
+
+// Theme management
+function toggleTheme() {
+ const currentTheme = document.documentElement.getAttribute('data-theme');
+ const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+
+ document.documentElement.setAttribute('data-theme', newTheme);
+ localStorage.setItem('theme', newTheme);
+
+ // Update toggle button
+ const icon = document.getElementById('theme-icon');
+ const text = document.getElementById('theme-text');
+
+ if (newTheme === 'dark') {
+ icon.textContent = '☀️';
+ text.textContent = 'Light Mode';
+ } else {
+ icon.textContent = '🌙';
+ text.textContent = 'Dark Mode';
+ }
+}
+
+// Load saved theme on page load
+function loadTheme() {
+ const savedTheme = localStorage.getItem('theme') || 'light';
+ document.documentElement.setAttribute('data-theme', savedTheme);
+
+ // Update toggle button
+ const icon = document.getElementById('theme-icon');
+ const text = document.getElementById('theme-text');
+
+ if (savedTheme === 'dark') {
+ icon.textContent = '☀️';
+ text.textContent = 'Light Mode';
+ } else {
+ icon.textContent = '🌙';
+ text.textContent = 'Dark Mode';
+ }
+}
+
+function formatDisplayLabel(key) {
+ return key
+ .replace(/_/g, ' ')
+ .replace(/\b\w/g, char => char.toUpperCase());
+}
+
+function formatConfigLabel(key) {
+ // Keep config names as-is (don't format them)
+ return key;
+}
+
+function sanitizeId(value) {
+ return value.replace(/[^a-zA-Z0-9_-]/g, '-');
+}
+
+// Load task categories from tasks.js (loaded via script tag)
+function loadTaskCategories() {
+ try {
+ // Check if TASKS_DATA is available (loaded from tasks.js)
+ if (typeof window.TASKS_DATA === 'undefined') {
+ throw new Error('TASKS_DATA not found. Please regenerate tasks.js by running generate_tasks.py');
+ }
+
+ const data = window.TASKS_DATA;
+ taskCategories = {};
+ taskConfigs = {};
+
+ Object.entries(data).forEach(([key, value]) => {
+ if (value && typeof value === 'object' && value.tasks) {
+ taskCategories[key] = value;
+ } else if (value && typeof value === 'object' && value.category) {
+ taskConfigs[key] = value;
+ }
+ });
+ } catch (error) {
+ console.error('Failed to load tasks data:', error);
+ alert('Failed to load tasks metadata. Please regenerate tasks.js via generate_tasks.py.');
+ throw error;
+ }
+}
+
+// Preset models configuration
+// These are common model configurations that users can quickly load
+// Keyed by a short preset id; each value is a flat object of model settings.
+// The "${...}" values are ordinary double-quoted strings, so JavaScript keeps
+// them as literal placeholder text instead of interpolating them.
+// NOTE(review): presumably the harness substitutes those placeholders from
+// environment variables - confirm against the harness config loader.
+// NOTE(review): nothing in the visible part of this file reads presetModels.
+const presetModels = {
+ "gpt-4o-mini": {
+ name: "gpt-4o-mini-audio-preview",
+ inference_type: "openai",
+ url: "${AZURE_ENDPOINT_URL}",
+ auth_token: "${AZURE_AUTH_TOKEN}",
+ api_version: "2025-01-01-preview",
+ delay: 100,
+ retry_attempts: 10,
+ timeout: 60,
+ batch_size: 300,
+ chunk_size: 30
+ },
+ "gemini-2.5-flash": {
+ name: "gemini-2.5-flash",
+ inference_type: "gemini",
+ location: "${GOOGLE_CLOUD_LOCATION}",
+ project_id: "${GOOGLE_CLOUD_PROJECT}",
+ model: "google/gemini-2.5-flash",
+ reasoning_effort: "medium",
+ delay: 150,
+ retry_attempts: 5,
+ timeout: 300,
+ batch_size: 100,
+ // NOTE(review): far larger than the other presets' chunk_size (30 and 40) - confirm this is intended.
+ chunk_size: 30240
+ },
+ "qwen-2.5-omni": {
+ name: "qwen-2.5-omni",
+ inference_type: "vllm",
+ url: "${VLLM_ENDPOINT_URL}",
+ auth_token: "${VLLM_AUTH_TOKEN}",
+ delay: 180,
+ retry_attempts: 8,
+ timeout: 120,
+ batch_size: 50,
+ chunk_size: 40
+ }
+};
+
+// Application state
+// Holds the current configuration of selected tasks and models
+const state = {
+ // One entry per selected (task or config, metric) pair. The selection
+ // handlers push objects shaped like
+ // { category, task, taskName, config, configName, metric, identifier }.
+ selectedTasks: [],
+ // NOTE(review): not written by any code visible here; generateConfig()
+ // reads models straight from the form via collectModelConfigurations().
+ models: [],
+ // Defaults for the advanced-options form. initializeAdvancedOptions() copies
+ // these into the inputs and writes user edits back under the same keys.
+ advancedOptions: {
+ sample_limit: 500,
+ min_duration: 1.0,
+ max_duration: 60.0,
+ language: "en",
+ accented: false,
+ metric_aggregation: "average",
+ judge_api_version: "",
+ judge_prompt_model_override: "",
+ judge_model: "gpt-4o-mini",
+ judge_type: "openai",
+ // Double-quoted, so these stay literal "${...}" placeholder strings.
+ judge_api_endpoint: "${ENDPOINT_URL}",
+ judge_api_key: "${AUTH_TOKEN}",
+ judge_concurrency: 16,
+ judge_temperature: 0.0,
+ generation_params_override: "",
+ prompt_overrides: ""
+ }
+};
+
+// Index handed to the next form created by addNewModel(), which post-increments it.
+// NOTE(review): starts at 1, presumably because the page already ships one
+// model form with index 0 - confirm against index.html.
+let modelCount = 1;
+
+function addSelectedTask(entry) {
+ const exists = state.selectedTasks.some(
+ task =>
+ task.identifier === entry.identifier &&
+ task.metric === entry.metric &&
+ task.category === entry.category
+ );
+ if (!exists) {
+ state.selectedTasks.push(entry);
+ }
+}
+
+function removeSelectedTask(predicate) {
+ state.selectedTasks = state.selectedTasks.filter(task => !predicate(task));
+}
+
+function updateCategoryCardState(categoryCard) {
+ if (!categoryCard) return;
+ const hasCheckedTask = categoryCard.querySelector('.task-checkbox:checked');
+ const hasCheckedConfig = categoryCard.querySelector('.task-config-checkbox:checked');
+ if (hasCheckedTask || hasCheckedConfig) {
+ categoryCard.classList.add('selected');
+ } else {
+ categoryCard.classList.remove('selected');
+ }
+}
+
+// Initialize the application
+// Called when the DOM is fully loaded
+document.addEventListener('DOMContentLoaded', function() {
+ loadTheme();
+ loadTaskCategories();
+ initializeTaskCategories();
+ initializeTaskSelectionControls();
+ initializeModelConfiguration();
+ initializeAdvancedOptions();
+ initializePreviewActions();
+});
+
+// Initialize model configuration
+// Sets up event listeners for adding models and loading examples
+function initializeModelConfiguration() {
+ // Add model button
+ document.getElementById('add-model-btn').addEventListener('click', addNewModel);
+
+ // Load example button
+ document.getElementById('load-example-btn').addEventListener('click', loadExampleConfig);
+}
+
+// Add new model configuration
+// Creates a new model form section dynamically
+// The section is appended to #models-container and tagged with a
+// data-model-index attribute; removeModel() selects elements by that attribute.
+function addNewModel() {
+ const container = document.getElementById('models-container');
+ // Post-increment: this form gets the current counter value and the counter
+ // moves on, so later forms receive increasing indices.
+ const modelIndex = modelCount++;
+
+ const modelDiv = document.createElement('div');
+ modelDiv.className = 'model-config-item';
+ modelDiv.dataset.modelIndex = modelIndex;
+
+ // The heading is 1-based (modelIndex + 1).
+ // NOTE(review): the form markup inside this template literal is not fully
+ // visible in this view of the patch - check it against the source file.
+ modelDiv.innerHTML = `
+
Model ${modelIndex + 1}
+
+
+
+ Advanced Config
+
+
+
+
+ `;
+
+ container.appendChild(modelDiv);
+ // Re-evaluate remove-button visibility now that the number of forms changed.
+ updateRemoveButtons();
+}
+
+// Remove model configuration
+function removeModel(modelIndex) {
+ const modelDiv = document.querySelector(`[data-model-index="${modelIndex}"]`);
+ if (modelDiv) {
+ modelDiv.remove();
+ updateRemoveButtons();
+ }
+}
+
+// Update remove buttons visibility
+// Hides remove button when only one model remains
+function updateRemoveButtons() {
+ const models = document.querySelectorAll('.model-config-item');
+ document.querySelectorAll('.remove-model-btn').forEach(btn => {
+ btn.style.display = models.length > 1 ? 'block' : 'none';
+ });
+}
+
+// Load example configuration
+// Fills the first model with a sample configuration
+function loadExampleConfig() {
+ const exampleConfig = {
+ name: "gpt-4o-mini-audio-preview",
+ displayName: "gpt-4o-mini-audio-preview-1",
+ inferenceType: "openai",
+ endpoint: "https://your-endpoint.openai.azure.com",
+ apiKey: "your-api-key-here",
+ authToken: "${AZURE_AUTH_TOKEN}",
+ apiVersion: "2025-01-01-preview",
+ location: "",
+ projectId: "",
+ delay: 100,
+ retry: 10,
+ timeout: 60,
+ batchSize: 300,
+ chunkSize: 30,
+ reasoningEffort: ""
+ };
+
+ // Fill the first model with example data
+ const firstModel = document.querySelector('.model-config-item');
+ if (firstModel) {
+ firstModel.querySelector('.model-name').value = exampleConfig.name;
+ firstModel.querySelector('.model-display-name').value = exampleConfig.displayName;
+ firstModel.querySelector('.model-inference-type').value = exampleConfig.inferenceType;
+ firstModel.querySelector('.model-endpoint').value = exampleConfig.endpoint;
+ firstModel.querySelector('.model-api-key').value = exampleConfig.apiKey;
+ firstModel.querySelector('.model-auth-token').value = exampleConfig.authToken;
+ firstModel.querySelector('.model-api-version').value = exampleConfig.apiVersion;
+ firstModel.querySelector('.model-location').value = exampleConfig.location;
+ firstModel.querySelector('.model-project-id').value = exampleConfig.projectId;
+ firstModel.querySelector('.model-delay').value = exampleConfig.delay;
+ firstModel.querySelector('.model-retry').value = exampleConfig.retry;
+ firstModel.querySelector('.model-timeout').value = exampleConfig.timeout;
+ firstModel.querySelector('.model-batch-size').value = exampleConfig.batchSize;
+ firstModel.querySelector('.model-chunk-size').value = exampleConfig.chunkSize;
+ firstModel.querySelector('.model-reasoning-effort').value = exampleConfig.reasoningEffort;
+ }
+}
+
+// Collect model configurations
+// Gathers all model data from the form into an array
+function collectModelConfigurations() {
+ const models = [];
+ document.querySelectorAll('.model-config-item').forEach(modelDiv => {
+ const model = {
+ name: modelDiv.querySelector('.model-name').value.trim(),
+ displayName: modelDiv.querySelector('.model-display-name').value.trim(),
+ inferenceType: modelDiv.querySelector('.model-inference-type').value,
+ endpoint: modelDiv.querySelector('.model-endpoint').value.trim(),
+ apiKey: modelDiv.querySelector('.model-api-key').value.trim(),
+ authToken: modelDiv.querySelector('.model-auth-token').value.trim() || undefined,
+ apiVersion: modelDiv.querySelector('.model-api-version').value.trim() || undefined,
+ location: modelDiv.querySelector('.model-location').value.trim() || undefined,
+ projectId: modelDiv.querySelector('.model-project-id').value.trim() || undefined,
+ delay: parseInt(modelDiv.querySelector('.model-delay').value) || 100,
+ retry: parseInt(modelDiv.querySelector('.model-retry').value) || 8,
+ timeout: parseInt(modelDiv.querySelector('.model-timeout').value) || 30,
+ batchSize: parseInt(modelDiv.querySelector('.model-batch-size').value) || 1,
+ chunkSize: parseInt(modelDiv.querySelector('.model-chunk-size').value) || 30,
+ reasoningEffort: modelDiv.querySelector('.model-reasoning-effort').value || undefined
+ };
+
+ // Only add if required fields are filled
+ if (model.name && model.displayName && model.inferenceType && model.endpoint && model.apiKey) {
+ models.push(model);
+ }
+ });
+
+ return models;
+}
+
+// Initialize task selection UI
+// Populates the task categories and their tasks
+// Rebuilds #task-categories from the module-level taskCategories map (one
+// card per category), then attaches change listeners to every task and
+// config checkbox it created.
+// NOTE(review): the HTML inside the template literals below is not fully
+// visible in this view of the patch; the comments describe only the
+// surrounding JavaScript.
+function initializeTaskCategories() {
+ const container = document.getElementById('task-categories');
+ // Start from an empty container so repeated calls do not duplicate cards.
+ container.innerHTML = '';
+
+ Object.entries(taskCategories).forEach(([categoryKey, category]) => {
+ const categoryCard = document.createElement('div');
+ categoryCard.className = 'category-card expanded';
+ categoryCard.dataset.category = categoryKey;
+
+ // One markup fragment per task; tasks that list configs take the first
+ // branch, tasks without configs fall through to the second return.
+ const tasksMarkup = Object.entries(category.tasks).map(([taskKey, task]) => {
+ const configsForTask = task.configs || [];
+ const hasConfigs = configsForTask.length > 0;
+
+ if (hasConfigs) {
+ // Smart expand: auto-expand only if exactly 1 config
+ const shouldAutoExpand = configsForTask.length === 1;
+ const expandedClass = shouldAutoExpand ? 'expanded' : '';
+
+ // Create config options with metrics displayed next to each config
+ const configOptions = configsForTask.map(configKey => {
+ // sanitizeId keeps letters, digits, "_" and "-" and replaces the rest with "-".
+ const configId = `config-${sanitizeId(`${categoryKey}-${taskKey}-${configKey}`)}`;
+ // task.metrics is treated as optional here: absent or empty yields ''.
+ const metricsMarkup = task.metrics && task.metrics.length > 0
+ ? `${task.metrics.join(', ')}`
+ : '';
+ return `
+
+
+
+
+ `;
+ }).join('');
+
+ // Show bulk action buttons only if more than 1 config
+ const bulkActionsMarkup = configsForTask.length > 1 ? `
+
+
+
+
+ ` : '';
+
+ return `
+
+
+
+ ${bulkActionsMarkup}
+
+ ${configOptions}
+
+
+
+ `;
+ }
+
+ const checkboxId = `task-${sanitizeId(`${categoryKey}-${taskKey}`)}`;
+ return `
+
+
+
+
+ `;
+ }).join('');
+
+ categoryCard.innerHTML = `
+ ${category.name}
+ ${category.description}
+
+ ${tasksMarkup}
+
+ `;
+
+ container.appendChild(categoryCard);
+ });
+
+ // Listeners are attached after all cards exist, using document-wide queries.
+ document.querySelectorAll('.task-checkbox').forEach(checkbox => {
+ checkbox.addEventListener('change', handleTaskSelection);
+ });
+ document.querySelectorAll('.task-config-checkbox').forEach(checkbox => {
+ checkbox.addEventListener('change', handleConfigSelection);
+ });
+}
+
+// Handle task selection
+// Updates the state when a task checkbox is toggled
+function handleTaskSelection(event) {
+ const checkbox = event.target;
+ const categoryKey = checkbox.dataset.category;
+ const taskKey = checkbox.dataset.task;
+ const category = taskCategories[categoryKey];
+ const task = category?.tasks?.[taskKey];
+
+ if (!task) return;
+
+ if (checkbox.checked) {
+ task.metrics.forEach(metric => {
+ addSelectedTask({
+ category: categoryKey,
+ task: taskKey,
+ taskName: task.name,
+ config: null,
+ configName: null,
+ metric,
+ identifier: taskKey
+ });
+ });
+ } else {
+ removeSelectedTask(
+ t => t.category === categoryKey && t.task === taskKey && t.identifier === taskKey
+ );
+ }
+
+ updateCategoryCardState(checkbox.closest('.category-card'));
+}
+
+function handleConfigSelection(event) {
+ const checkbox = event.target;
+ const { category: categoryKey, task: taskKey, config: configKey } = checkbox.dataset;
+ const category = taskCategories[categoryKey];
+ const task = category?.tasks?.[taskKey];
+
+ if (!task) return;
+
+ if (checkbox.checked) {
+ task.metrics.forEach(metric => {
+ addSelectedTask({
+ category: categoryKey,
+ task: taskKey,
+ taskName: task.name,
+ config: configKey,
+ configName: formatConfigLabel(configKey),
+ metric,
+ identifier: configKey
+ });
+ });
+ } else {
+ removeSelectedTask(
+ t => t.category === categoryKey && t.task === taskKey && t.identifier === configKey
+ );
+ }
+
+ updateCategoryCardState(checkbox.closest('.category-card'));
+}
+
+// Initialize advanced options
+// Binds form inputs to state variables
+function initializeAdvancedOptions() {
+ const inputs = {
+ 'sample-limit': 'sample_limit',
+ 'min-duration': 'min_duration',
+ 'max-duration': 'max_duration',
+ 'language': 'language',
+ 'accented': 'accented',
+ 'metric-aggregation': 'metric_aggregation',
+ 'judge-api-version': 'judge_api_version',
+ 'judge-prompt-model-override': 'judge_prompt_model_override',
+ 'judge-model': 'judge_model',
+ 'judge-type': 'judge_type',
+ 'judge-api-endpoint': 'judge_api_endpoint',
+ 'judge-api-key': 'judge_api_key',
+ 'judge-concurrency': 'judge_concurrency',
+ 'judge-temperature': 'judge_temperature',
+ 'generation-params-override': 'generation_params_override',
+ 'prompt-overrides': 'prompt_overrides'
+ };
+
+ Object.entries(inputs).forEach(([id, stateKey]) => {
+ const element = document.getElementById(id);
+ if (element) {
+ element.addEventListener('change', function() {
+ if (element.type === 'checkbox') {
+ state.advancedOptions[stateKey] = element.checked;
+ } else {
+ state.advancedOptions[stateKey] = element.value;
+ }
+ });
+ // Set initial value
+ if (element.type === 'checkbox') {
+ element.checked = state.advancedOptions[stateKey];
+ } else {
+ element.value = state.advancedOptions[stateKey];
+ }
+ }
+ });
+}
+
+// Initialize preview actions
+// Sets up event listeners for config generation, copy, and download
+function initializePreviewActions() {
+ document.getElementById('generate-config').addEventListener('click', generateConfig);
+ document.getElementById('copy-config').addEventListener('click', copyConfig);
+ document.getElementById('download-config').addEventListener('click', downloadConfig);
+}
+
+// Generate configuration
+// Creates the YAML config from current state
+function generateConfig() {
+ const models = collectModelConfigurations();
+
+ if (state.selectedTasks.length === 0) {
+ alert('Please select at least one task');
+ return;
+ }
+
+ if (models.length === 0) {
+ alert('Please configure at least one model');
+ return;
+ }
+
+ // Generate timestamp in YYYYMMDD_HHMMSS format
+ const now = new Date();
+ const timestamp = now.getFullYear() +
+ String(now.getMonth() + 1).padStart(2, '0') +
+ String(now.getDate()).padStart(2, '0') + '_' +
+ String(now.getHours()).padStart(2, '0') +
+ String(now.getMinutes()).padStart(2, '0') +
+ String(now.getSeconds()).padStart(2, '0');
+
+ // Generate aggregate from selected tasks/configs
+ const metricGroups = {};
+ state.selectedTasks.forEach(task => {
+ const targetKey = task.identifier || task.task;
+ if (!metricGroups[task.metric]) metricGroups[task.metric] = [];
+ if (!metricGroups[task.metric].includes(targetKey)) {
+ metricGroups[task.metric].push(targetKey);
+ }
+ });
+ const aggregate = Object.entries(metricGroups).filter(([metric, tasks]) => tasks.length > 1).map(([metric, tasks]) => [metric, tasks]);
+
+ const config = {
+ task_metric: state.selectedTasks.map(task => [
+ task.identifier || task.task,
+ task.metric
+ ])
+ };
+
+ // Add aggregate if there are grouped metrics
+ if (aggregate.length > 0) {
+ config.aggregate = aggregate;
+ }
+
+ config.filter = {
+ num_samples: state.advancedOptions.sample_limit,
+ length_filter: [state.advancedOptions.min_duration, state.advancedOptions.max_duration],
+ language: state.advancedOptions.language,
+ accented: state.advancedOptions.accented
+ };
+
+ config.judge_settings = {
+ judge_model: state.advancedOptions.judge_model,
+ judge_type: state.advancedOptions.judge_type,
+ judge_api_endpoint: state.advancedOptions.judge_api_endpoint,
+ judge_api_key: state.advancedOptions.judge_api_key,
+ judge_concurrency: state.advancedOptions.judge_concurrency,
+ judge_temperature: state.advancedOptions.judge_temperature,
+ ...(state.advancedOptions.judge_api_version && { judge_api_version: state.advancedOptions.judge_api_version }),
+ ...(state.advancedOptions.judge_prompt_model_override && { judge_prompt_model_override: state.advancedOptions.judge_prompt_model_override })
+ };
+
+ config.logging = {
+ log_file: `run_${timestamp}.log`
+ };
+
+ // Add generation params override if provided
+ if (state.advancedOptions.generation_params_override.trim()) {
+ config.generation_params_override = state.advancedOptions.generation_params_override.trim();
+ }
+
+ // Add prompt overrides if provided
+ if (state.advancedOptions.prompt_overrides.trim()) {
+ config.prompt_overrides = state.advancedOptions.prompt_overrides.trim();
+ }
+
+ config.models = models.map((model, index) => {
+ const modelConfig = {
+ name: model.displayName,
+ inference_type: model.inferenceType,
+ url: model.endpoint,
+ model: model.name,
+ auth_token: model.authToken || model.apiKey,
+ delay: model.delay,
+ retry_attempts: model.retry,
+ timeout: model.timeout,
+ batch_size: model.batchSize,
+ chunk_size: model.chunkSize
+ };
+
+ // Add optional fields only if they have values
+ if (model.apiVersion) modelConfig.api_version = model.apiVersion;
+ if (model.location) modelConfig.location = model.location;
+ if (model.projectId) modelConfig.project_id = model.projectId;
+ if (model.reasoningEffort) modelConfig.reasoning_effort = model.reasoningEffort;
+
+ return modelConfig;
+ });
+
+ // Add metric_aggregation
+ if (state.advancedOptions.metric_aggregation !== "average") {
+ config.metric_aggregation = state.advancedOptions.metric_aggregation;
+ }
+
+ const yaml = generateYAML(config);
+ document.getElementById('config-preview').textContent = yaml;
+}
+
+// Generate YAML from config object
+// Recursively generates YAML string from a given object
+function generateYAML(obj, indent = 0) {
+ const spaces = ' '.repeat(indent);
+ let yaml = '';
+
+ if (Array.isArray(obj)) {
+ obj.forEach(item => {
+ if (typeof item === 'object') {
+ yaml += `${spaces}-\n`;
+ yaml += generateYAML(item, indent + 1);
+ } else {
+ yaml += `${spaces}- ${item}\n`;
+ }
+ });
+ } else if (typeof obj === 'object' && obj !== null) {
+ Object.entries(obj).forEach(([key, value]) => {
+ if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
+ yaml += `${spaces}${key}:\n`;
+ yaml += generateYAML(value, indent + 1);
+ } else if (Array.isArray(value)) {
+ yaml += `${spaces}${key}:\n`;
+ yaml += generateYAML(value, indent + 1);
+ } else {
+ yaml += `${spaces}${key}: ${value}\n`;
+ }
+ });
+ }
+
+ return yaml;
+}
+
+// Download configuration
+// Saves the generated YAML config to a file
+function downloadConfig() {
+ const config = document.getElementById('config-preview').textContent;
+
+ // Generate timestamp in YYYYMMDD_HHMMSS format
+ const now = new Date();
+ const timestamp = now.getFullYear() +
+ String(now.getMonth() + 1).padStart(2, '0') +
+ String(now.getDate()).padStart(2, '0') + '_' +
+ String(now.getHours()).padStart(2, '0') +
+ String(now.getMinutes()).padStart(2, '0') +
+ String(now.getSeconds()).padStart(2, '0');
+
+ const filename = `au-harness-config-${timestamp}.yaml`;
+
+ const blob = new Blob([config], { type: 'text/yaml' });
+ const url = URL.createObjectURL(blob);
+
+ const a = document.createElement('a');
+ a.href = url;
+ a.download = filename;
+ document.body.appendChild(a);
+ a.click();
+ document.body.removeChild(a);
+ URL.revokeObjectURL(url);
+}
+
+// Copy configuration to clipboard
+// Copies the generated YAML config to the clipboard
+async function copyConfig() {
+ const config = document.getElementById('config-preview').textContent;
+ const button = document.getElementById('copy-config');
+ const originalText = button.textContent;
+
+ try {
+ if (navigator.clipboard && window.isSecureContext) {
+ // Use the Clipboard API when available
+ await navigator.clipboard.writeText(config);
+ } else {
+ // Fallback for older browsers or non-HTTPS contexts
+ const textArea = document.createElement('textarea');
+ textArea.value = config;
+ textArea.style.position = 'fixed';
+ textArea.style.left = '-999999px';
+ textArea.style.top = '-999999px';
+ document.body.appendChild(textArea);
+ textArea.focus();
+ textArea.select();
+
+ try {
+ document.execCommand('copy');
+ } finally {
+ document.body.removeChild(textArea);
+ }
+ }
+
+ // Visual feedback
+ button.textContent = '✅ Copied!';
+ button.style.background = 'var(--success-color)';
+
+ setTimeout(() => {
+ button.textContent = originalText;
+ button.style.background = '';
+ }, 2000);
+
+ } catch (err) {
+ console.error('Failed to copy: ', err);
+ button.textContent = '❌ Failed';
+ button.style.background = 'var(--error-color)';
+
+ setTimeout(() => {
+ button.textContent = originalText;
+ button.style.background = '';
+ }, 2000);
+ }
+}
+
+// Initialize task selection controls
+// Sets up event listeners for select all and reset buttons
+function initializeTaskSelectionControls() {
+ document.getElementById('select-all-tasks').addEventListener('click', selectAllTasks);
+ document.getElementById('reset-selection').addEventListener('click', resetSelection);
+}
+
+// Select all tasks
+// Toggles all task checkboxes to checked
+function selectAllTasks() {
+ const taskCheckboxes = document.querySelectorAll('.task-checkbox');
+ taskCheckboxes.forEach(checkbox => {
+ if (!checkbox.checked) {
+ checkbox.checked = true;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+
+ const configCheckboxes = document.querySelectorAll('.task-config-checkbox');
+ configCheckboxes.forEach(checkbox => {
+ if (!checkbox.checked) {
+ checkbox.checked = true;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+}
+
+// Reset selection
+// Toggles all task checkboxes to unchecked
+function resetSelection() {
+ const taskCheckboxes = document.querySelectorAll('.task-checkbox');
+ taskCheckboxes.forEach(checkbox => {
+ if (checkbox.checked) {
+ checkbox.checked = false;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+
+ const configCheckboxes = document.querySelectorAll('.task-config-checkbox');
+ configCheckboxes.forEach(checkbox => {
+ if (checkbox.checked) {
+ checkbox.checked = false;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+}
+
+// Toggle task configs visibility
+function toggleTaskConfigs(headerElement) {
+ const taskItem = headerElement.closest('.task-item');
+ const icon = headerElement.querySelector('.expand-icon');
+
+ taskItem.classList.toggle('expanded');
+ icon.textContent = taskItem.classList.contains('expanded') ? '▼' : '▶';
+}
+
+// Select all configs for a specific task
+function selectAllConfigsForTask(categoryKey, taskKey, event) {
+ event.stopPropagation();
+ const checkboxes = document.querySelectorAll(
+ `.task-config-checkbox[data-category="${categoryKey}"][data-task="${taskKey}"]`
+ );
+ checkboxes.forEach(checkbox => {
+ if (!checkbox.checked) {
+ checkbox.checked = true;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+}
+
+// Deselect all configs for a specific task
+function deselectAllConfigsForTask(categoryKey, taskKey, event) {
+ event.stopPropagation();
+ const checkboxes = document.querySelectorAll(
+ `.task-config-checkbox[data-category="${categoryKey}"][data-task="${taskKey}"]`
+ );
+ checkboxes.forEach(checkbox => {
+ if (checkbox.checked) {
+ checkbox.checked = false;
+ checkbox.dispatchEvent(new Event('change'));
+ }
+ });
+}
+
+// Filter configs by pattern
+function filterByPattern(pattern) {
+ const allConfigItems = document.querySelectorAll('.config-item');
+ let matchCount = 0;
+
+ allConfigItems.forEach(item => {
+ const label = item.querySelector('label');
+ if (label && label.textContent.toLowerCase().includes(pattern.toLowerCase())) {
+ item.style.display = 'flex';
+ matchCount++;
+ } else {
+ item.style.display = 'none';
+ }
+ });
+
+ // Auto-expand tasks that have matching configs
+ document.querySelectorAll('.task-item.has-configs').forEach(taskItem => {
+ const visibleConfigs = taskItem.querySelectorAll('.config-item[style*="flex"]');
+ if (visibleConfigs.length > 0) {
+ taskItem.classList.add('expanded');
+ const icon = taskItem.querySelector('.expand-icon');
+ if (icon) icon.textContent = '▼';
+ }
+ });
+
+ // Show feedback
+ if (matchCount === 0) {
+ alert(`No configs found matching "${pattern}"`);
+ }
+}
+
+// Clear filter
+function clearFilter() {
+ const allConfigItems = document.querySelectorAll('.config-item');
+ allConfigItems.forEach(item => {
+ item.style.display = 'flex';
+ });
+
+ // Restore default expand/collapse state (only 1-config tasks expanded)
+ document.querySelectorAll('.task-item.has-configs').forEach(taskItem => {
+ const configCount = taskItem.querySelectorAll('.config-item').length;
+ const shouldExpand = configCount === 1;
+
+ if (shouldExpand) {
+ taskItem.classList.add('expanded');
+ } else {
+ taskItem.classList.remove('expanded');
+ }
+
+ const icon = taskItem.querySelector('.expand-icon');
+ if (icon) icon.textContent = shouldExpand ? '▼' : '▶';
+ });
+}
diff --git a/ui/generate_tasks.py b/ui/generate_tasks.py
new file mode 100644
index 0000000..702b0ae
--- /dev/null
+++ b/ui/generate_tasks.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Script to dynamically generate tasks.json from the tasks folder structure.
+Loads metrics directly from the YAML files under the tasks directory instead of parsing README tables.
+"""
+
+import json
+from pathlib import Path
+from typing import Dict, List
+
+import yaml
+
+
+ def get_category_description(category):
+     """
+     Look up the one-sentence description shown for a task category.
+
+     Known category keys map to hand-written text; any other key gets a generic
+     sentence derived from the key with underscores turned into spaces.
+     """
+     # Built up front (like the original default argument) so a non-string
+     # category fails the same way regardless of the lookup result.
+     generic_text = f"Tasks related to {category.replace('_', ' ')}."
+
+     known_descriptions = dict(
+         speech_recognition="Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.",
+         paralinguistics="Tasks that analyze non-verbal aspects of speech such as emotion, gender, accent, and speaker characteristics.",
+         audio_understanding="Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.",
+         spoken_language_understanding="Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.",
+         spoken_language_reasoning="Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.",
+         safety_and_security="Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.",
+         speech_enhancement="Tasks related to speech quality improvement, noise detection, and audio enhancement.",
+         speech_disorder="Tasks related to detecting and analyzing speech disorders and voice pathologies.",
+         phonetics="Tasks related to phonetic analysis, phoneme recognition, and speech sound processing.",
+     )
+
+     if category in known_descriptions:
+         return known_descriptions[category]
+     return generic_text
+
+
+ def get_category_display_name(category):
+     """
+     Return the UI label for a category key.
+
+     Known categories get an emoji-prefixed label; unknown keys fall back to the
+     key itself with underscores replaced by spaces and title-cased.
+     """
+     # Computed first, mirroring the eagerly evaluated default of the lookup.
+     fallback_label = category.replace('_', ' ').title()
+
+     labelled = [
+         ("speech_recognition", "🗣️ Speech Recognition"),
+         ("paralinguistics", "🎭 Paralinguistics"),
+         ("audio_understanding", "🔊 Audio Understanding"),
+         ("spoken_language_understanding", "🧠 Spoken Language Understanding"),
+         ("spoken_language_reasoning", "🧩 Spoken Language Reasoning"),
+         ("safety_and_security", "🔐 Safety and Security"),
+         ("speech_enhancement", "✨ Speech Enhancement"),
+         ("speech_disorder", "🩺 Speech Disorder"),
+         ("phonetics", "📢 Phonetics"),
+     ]
+     emoji_labels = dict(labelled)
+
+     return emoji_labels[category] if category in emoji_labels else fallback_label
+
+
+ # Words that must stay fully upper-case in display names; plain title-casing
+ # turns e.g. "asr" into "Asr".
+ _TASK_NAME_ACRONYMS = {
+     "asr": "ASR",
+     "llm": "LLM",
+     "qa": "QA",
+     "sql": "SQL",
+     "tts": "TTS",
+ }
+
+
+ def format_task_name(task_name):
+     """
+     Format a task directory name for display.
+
+     Underscores become spaces and every word is title-cased, except for known
+     acronyms (see _TASK_NAME_ACRONYMS), which are rendered in upper case:
+     "long_form_asr" -> "Long Form ASR".
+
+     Args:
+         task_name: Task key, typically the task directory name.
+
+     Returns:
+         The human-readable task name.
+     """
+     # Splitting on single spaces (not arbitrary whitespace) keeps the spacing
+     # of the result identical to what str.title() on the whole string gave.
+     words = task_name.replace('_', ' ').split(' ')
+     return ' '.join(
+         _TASK_NAME_ACRONYMS.get(word.lower(), word.title()) for word in words
+     )
+
+
+ def safe_load_yaml(yaml_path: Path) -> Dict:
+     """
+     Load a YAML file and return its top-level mapping.
+
+     Never raises for unreadable, undecodable or malformed files; a warning is
+     printed instead and an empty dict is returned.
+
+     Args:
+         yaml_path: Path of the YAML file to read (decoded as UTF-8).
+
+     Returns:
+         The parsed top-level mapping, or {} when the file is empty, cannot be
+         read or parsed, or its top-level value is not a mapping.
+     """
+     try:
+         with open(yaml_path, "r", encoding="utf-8") as f:
+             data = yaml.safe_load(f)
+     except yaml.YAMLError as exc:
+         print(f"Warning: Failed to parse YAML file {yaml_path}: {exc}")
+         return {}
+     except (OSError, UnicodeDecodeError) as exc:
+         # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
+         # listed explicitly to keep the "never raises" promise.
+         print(f"Warning: Failed to read YAML file {yaml_path}: {exc}")
+         return {}
+
+     if not data:
+         # Empty document (None) or another falsy root value.
+         return {}
+
+     if not isinstance(data, dict):
+         # A list or scalar root would break callers that use data.get(...).
+         print(
+             f"Warning: Ignoring YAML file {yaml_path}: expected a mapping at the "
+             f"top level, got {type(data).__name__}"
+         )
+         return {}
+
+     return data
+
+
+ def extract_metrics_from_yaml(data: Dict) -> List[str]:
+     """
+     Pull the metric names out of a parsed YAML document.
+
+     The document's "metrics" entry must be a list whose items are either plain
+     strings or mappings with a "metric" key; every other shape is ignored.
+     Empty names are dropped and duplicates removed, keeping first-seen order.
+     """
+     if not isinstance(data, dict):
+         return []
+
+     entries = data.get("metrics")
+     if not isinstance(entries, list):
+         return []
+
+     names = []
+     for entry in entries:
+         if isinstance(entry, str):
+             names.append(entry)
+         elif isinstance(entry, dict) and "metric" in entry:
+             names.append(entry["metric"])
+
+     # dict keys are insertion-ordered, which gives an order-preserving dedupe.
+     return list(dict.fromkeys(name for name in names if name))
+
+
+ def collect_metrics_from_task_dir(task_dir: Path) -> List[str]:
+     """
+     Determine the metrics of a task from the YAML files in its directory.
+
+     base.yaml directly inside the task directory is consulted first when it
+     exists; after that every other *.yaml file found recursively is tried in
+     sorted order. The metrics of the first file that declares any are
+     returned; an empty list means no file declared metrics.
+     """
+     base_yaml = task_dir / "base.yaml"
+
+     candidates = [base_yaml] if base_yaml.exists() else []
+     candidates.extend(
+         path for path in sorted(task_dir.rglob("*.yaml")) if path != base_yaml
+     )
+
+     for candidate in candidates:
+         found = extract_metrics_from_yaml(safe_load_yaml(candidate))
+         if found:
+             return found
+
+     return []
+
+
+ def collect_configs_from_task_dir(task_dir: Path) -> List[str]:
+     """
+     Collect config identifiers from the YAML files of a task directory.
+
+     Every *.yaml file below task_dir is visited in sorted order, skipping
+     base.yaml at any depth. A file contributes its "task_name" value, or its
+     file stem when that key is missing or empty. Duplicates are dropped,
+     keeping the first occurrence.
+
+     Args:
+         task_dir: Directory of a single task.
+
+     Returns:
+         The ordered, de-duplicated list of config identifiers.
+     """
+     configs = []
+
+     for yaml_path in sorted(task_dir.rglob("*.yaml")):
+         # Skip base.yaml files at any level
+         if yaml_path.name.lower() == "base.yaml":
+             continue
+
+         data = safe_load_yaml(yaml_path)
+         # A YAML file whose root is a list or scalar has no .get(); treat it
+         # like a file without task_name instead of crashing the whole run.
+         declared_name = data.get("task_name") if isinstance(data, dict) else None
+         config_name = declared_name or yaml_path.stem
+         if config_name and config_name not in configs:
+             configs.append(config_name)
+
+     return configs
+
+
+ def load_task_categories_from_yaml(tasks_dir: Path):
+     """
+     Walk the tasks directory and describe every category and task found.
+
+     Directories directly under tasks_dir are categories; directories directly
+     under a category are tasks. A task is kept only if metrics can be found in
+     its YAML files, and a category is kept only if at least one task is kept.
+
+     Returns a tuple (task_categories, task_details):
+       * task_categories maps category key -> {"name", "description", "tasks"},
+         where "tasks" maps task key -> {"name", "metrics"[, "configs"]}.
+       * task_details maps task key -> the same task entry plus "category".
+         It is keyed by task key only, so a task key that occurs in several
+         categories keeps the entry of the last one visited.
+     """
+     def by_name(entry):
+         return entry.name
+
+     categories = {}
+     details = {}
+
+     for category_dir in sorted(tasks_dir.iterdir(), key=by_name):
+         if not category_dir.is_dir():
+             continue
+
+         category_key = category_dir.name
+         category_tasks = {}
+
+         for task_dir in sorted(category_dir.iterdir(), key=by_name):
+             if not task_dir.is_dir():
+                 continue
+
+             metrics = collect_metrics_from_task_dir(task_dir)
+             if not metrics:
+                 # Tasks without discoverable metrics are not offered in the UI.
+                 continue
+
+             task_key = task_dir.name
+             configs = collect_configs_from_task_dir(task_dir)
+
+             info = {"name": format_task_name(task_key), "metrics": metrics}
+             if configs:
+                 info["configs"] = configs
+
+             category_tasks[task_key] = info
+             # Flat view: same fields plus the owning category.
+             details[task_key] = {**info, "category": category_key}
+
+         if category_tasks:
+             categories[category_key] = {
+                 "name": get_category_display_name(category_key),
+                 "description": get_category_description(category_key),
+                 "tasks": category_tasks,
+             }
+
+     return categories, details
+
+
+ def main():
+     """
+     Generate tasks.json and tasks.js next to this script from ../tasks.
+
+     Both files contain the same JSON object: the category entries and the flat
+     per-task entries merged into one mapping. tasks.js wraps that object in a
+     `window.TASKS_DATA = ...;` assignment so the UI also works when index.html
+     is opened directly from disk.
+
+     Returns:
+         0 on success, 1 when the tasks directory is missing or no task
+         categories could be discovered (nothing is written in that case).
+     """
+     import sys  # local import: only needed for stderr reporting
+
+     script_dir = Path(__file__).parent
+     project_root = script_dir.parent
+     tasks_dir = project_root / "tasks"
+
+     # is_dir() also rejects a plain file named "tasks", which would otherwise
+     # fail later inside iterdir().
+     if not tasks_dir.is_dir():
+         print(f"Error: Tasks directory not found at {tasks_dir}", file=sys.stderr)
+         return 1
+
+     task_categories, task_details = load_task_categories_from_yaml(tasks_dir)
+
+     if not task_categories:
+         print("No task categories were discovered from YAML files.", file=sys.stderr)
+         return 1
+
+     # Categories and tasks share one namespace in the output; a task key equal
+     # to a category key replaces that category, so at least make it visible.
+     shadowed = sorted(set(task_categories) & set(task_details))
+     if shadowed:
+         print(
+             "Warning: task keys shadow category keys in the combined output: "
+             + ", ".join(shadowed),
+             file=sys.stderr,
+         )
+
+     combined_output = {**task_categories, **task_details}
+     # Serialise once so both files are guaranteed to carry identical data.
+     payload = json.dumps(combined_output, indent=2, ensure_ascii=False)
+
+     # Generate tasks.json
+     json_output_file = script_dir / "tasks.json"
+     json_output_file.write_text(payload, encoding="utf-8")
+
+     # Generate tasks.js for local file access (no server needed)
+     js_output_file = script_dir / "tasks.js"
+     js_output_file.write_text(
+         "// Auto-generated by generate_tasks.py\n"
+         "// This file allows the UI to work when opening index.html directly in a browser\n"
+         "window.TASKS_DATA = " + payload + ";\n",
+         encoding="utf-8",
+     )
+
+     print(f"Successfully generated {json_output_file} and {js_output_file} with {len(task_categories)} task categories")
+     return 0
+
+
+ if __name__ == "__main__":
+     # Propagate failures to the shell instead of always exiting with 0.
+     raise SystemExit(main())
diff --git a/ui/index.html b/ui/index.html
new file mode 100644
index 0000000..cd38b13
--- /dev/null
+++ b/ui/index.html
@@ -0,0 +1,266 @@
+
+
+
+
+
+ AU-Harness Configuration UI
+
+
+
+
+
+
+
+
+
+ 📊 Task Selection
+ Select tasks for evaluation
+
+
+
+
+
+
+
+
+
Quick Filters:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 🤖 Model Configuration
+ Configure one or more models for evaluation. Add multiple models to compare performance.
+
+
+
+
Model 1
+
+
+
+ Advanced Config
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ⚖️ Judge Configuration
+ Configure the judge model for LLM-based evaluation metrics. Required when selecting tasks with judge metrics.
+
+
+
+
+
+
+ ⚙️ Advanced Configuration
+
+ Evaluation Settings
+
+
+
+ Prompt Updates
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 📋 Configuration Preview
+
+
+
+
+
+ # Configuration will appear here...
+
+
+
+
+
+
+
+
+
diff --git a/ui/styles.css b/ui/styles.css
new file mode 100644
index 0000000..c83ead1
--- /dev/null
+++ b/ui/styles.css
@@ -0,0 +1,919 @@
+/* CSS Reset */
+* {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+}
+
+/* CSS Variables */
+:root {
+ --primary-color: #2563eb;
+ --secondary-color: #64748b;
+ --success-color: #16a34a;
+ --warning-color: #d97706;
+ --error-color: #dc2626;
+ --background: #f8fafc;
+ --card-bg: #ffffff;
+ --text-primary: #1e293b;
+ --text-secondary: #64748b;
+ --border-color: #e2e8f0;
+ --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1);
+ --shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.1);
+ --header-gradient-start: #2563eb;
+ --header-gradient-end: #3b82f6;
+ --task-header-bg: #fafbfc;
+ --task-header-hover: #f1f5f9;
+ --filter-bg: #f8fafc;
+}
+
+/* Dark Mode Variables */
+[data-theme="dark"] {
+ --primary-color: #3b82f6;
+ --secondary-color: #94a3b8;
+ --success-color: #22c55e;
+ --warning-color: #f59e0b;
+ --error-color: #ef4444;
+ --background: #0f172a;
+ --card-bg: #1e293b;
+ --text-primary: #f1f5f9;
+ --text-secondary: #94a3b8;
+ --border-color: #334155;
+ --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.5);
+ --shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.5);
+ --header-gradient-start: #1e3a8a;
+ --header-gradient-end: #1e40af;
+ --task-header-bg: #334155;
+ --task-header-hover: #475569;
+ --filter-bg: #334155;
+}
+
+/* Base Styles */
+body {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ background: var(--background);
+ color: var(--text-primary);
+ line-height: 1.6;
+}
+
+/* Layout */
+.container {
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 20px;
+}
+
+header {
+ text-align: center;
+ margin-bottom: 40px;
+ padding: 40px 0;
+ background: linear-gradient(135deg, var(--header-gradient-start), var(--header-gradient-end));
+ color: white;
+ border-radius: 12px;
+ position: relative;
+}
+
+header h1 {
+ font-size: 2.5rem;
+ margin-bottom: 10px;
+}
+
+header p {
+ font-size: 1.1rem;
+ opacity: 0.9;
+}
+
+/* Dark Mode Toggle */
+.theme-toggle {
+ position: absolute;
+ top: 20px;
+ right: 20px;
+ background: rgba(255, 255, 255, 0.2);
+ border: 1px solid rgba(255, 255, 255, 0.3);
+ color: white;
+ padding: 8px 16px;
+ border-radius: 20px;
+ cursor: pointer;
+ font-size: 0.9rem;
+ transition: all 0.3s;
+ display: flex;
+ align-items: center;
+ gap: 6px;
+}
+
+.theme-toggle:hover {
+ background: rgba(255, 255, 255, 0.3);
+ transform: scale(1.05);
+}
+
+/* Cards */
+.card {
+ background: var(--card-bg);
+ border-radius: 12px;
+ padding: 24px;
+ margin-bottom: 24px;
+ box-shadow: var(--shadow);
+ border: 1px solid var(--border-color);
+}
+
+.card h2 {
+ font-size: 1.5rem;
+ margin-bottom: 20px;
+ color: var(--text-primary);
+ display: flex;
+ align-items: center;
+ gap: 8px;
+}
+
+/* Task Categories */
+.categories-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
+ gap: 24px;
+ margin-bottom: 24px;
+}
+
+.category-card {
+ border: 2px solid var(--border-color);
+ border-radius: 8px;
+ padding: 16px;
+ cursor: pointer;
+ transition: all 0.2s;
+}
+
+.category-card:hover {
+ border-color: var(--primary-color);
+ box-shadow: var(--shadow);
+}
+
+.category-card.selected {
+ border-color: var(--primary-color);
+ background: #eff6ff;
+}
+
+.category-card h3 {
+ font-size: 1.2rem;
+ margin-bottom: 8px;
+ color: var(--primary-color);
+}
+
+.category-card p {
+ color: var(--text-secondary);
+ font-size: 0.9rem;
+ margin-bottom: 12px;
+}
+
+.tasks-list {
+ display: none;
+ margin-top: 12px;
+}
+
+.category-card.expanded .tasks-list {
+ display: flex;
+ flex-direction: column;
+ gap: 12px;
+}
+
+.task-item {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ margin-bottom: 8px;
+ padding: 8px;
+ border-radius: 4px;
+ transition: background 0.2s;
+}
+
+.task-item:hover {
+ background: var(--background);
+}
+
+.task-item input[type="checkbox"] {
+ margin: 0;
+}
+
+.task-item label {
+ flex: 1;
+ cursor: pointer;
+ font-size: 0.9rem;
+}
+
+.task-item .task-title {
+ font-weight: 600;
+ display: inline-block;
+ margin-bottom: 4px;
+}
+
+.task-item.has-configs {
+ flex-direction: column;
+ align-items: flex-start;
+ border: 1px solid var(--border-color);
+ padding: 0;
+ margin-bottom: 12px;
+ min-width: 0;
+ overflow: hidden;
+}
+
+.task-header {
+ width: 100%;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 12px;
+ cursor: pointer;
+ background: var(--task-header-bg);
+ transition: background 0.2s;
+}
+
+.task-header:hover {
+ background: var(--task-header-hover);
+}
+
+.task-item .task-info {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex-wrap: wrap;
+ min-width: 0;
+ flex: 1;
+}
+
+.config-count-badge {
+ background: var(--primary-color);
+ color: white;
+ padding: 2px 8px;
+ border-radius: 12px;
+ font-size: 0.75rem;
+ font-weight: 600;
+}
+
+.expand-icon {
+ font-size: 0.9rem;
+ color: var(--text-secondary);
+ user-select: none;
+}
+
+.config-options {
+ width: 100%;
+ padding: 12px;
+ border-top: 1px solid var(--border-color);
+ display: none;
+ flex-direction: column;
+ gap: 12px;
+ min-width: 0;
+ box-sizing: border-box;
+}
+
+.task-item.has-configs.expanded .config-options {
+ display: flex;
+}
+
+.config-actions {
+ display: flex;
+ gap: 8px;
+}
+
+ /* Outline-style action buttons (select all / none) inside a task's config panel. */
+ .config-actions button {
+     padding: 4px 12px;
+     font-size: 0.85rem;
+     border: 1px solid var(--primary-color);
+     /* --card-bg is #ffffff in the light theme, so light mode is unchanged;
+        a literal "white" left a bright box in the dark theme. */
+     background: var(--card-bg);
+     color: var(--primary-color);
+     border-radius: 4px;
+     cursor: pointer;
+     transition: all 0.2s;
+     font-weight: 500;
+ }
+
+.config-actions button:hover {
+ background: var(--primary-color);
+ color: white;
+ border-color: var(--primary-color);
+}
+
+.config-list {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 8px 16px;
+ max-height: 400px;
+ overflow-y: auto;
+ overflow-x: hidden;
+ padding: 8px 0;
+ min-width: 0;
+ width: 100%;
+}
+
+.config-label {
+ width: 100%;
+ font-size: 0.85rem;
+ font-weight: 600;
+ color: var(--text-secondary);
+}
+
+.config-item {
+ display: flex;
+ align-items: center;
+ gap: 6px;
+ font-size: 0.9rem;
+ min-width: 0;
+}
+
+.config-item label {
+ word-break: break-word;
+ overflow-wrap: break-word;
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex: 1;
+}
+
+.config-name {
+ font-weight: 500;
+}
+
+.config-metrics {
+ font-size: 0.75rem;
+ color: var(--text-secondary);
+ font-style: italic;
+}
+
+.config-item input[type="checkbox"] {
+ margin: 0;
+ flex-shrink: 0;
+}
+
+.metrics-chips {
+ display: flex;
+ gap: 4px;
+ flex-wrap: wrap;
+ margin-top: 4px;
+}
+
+.metric-chip {
+ background: var(--secondary-color);
+ color: white;
+ padding: 2px 8px;
+ border-radius: 12px;
+ font-size: 0.75rem;
+}
+
+/* Quick Filters */
+.quick-filters {
+ margin: 16px 0;
+ padding: 16px;
+ background: var(--filter-bg);
+ border-radius: 8px;
+ border: 1px solid var(--border-color);
+}
+
+.quick-filters h4 {
+ margin: 0 0 12px 0;
+ font-size: 0.9rem;
+ color: var(--text-secondary);
+}
+
+.filter-buttons {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 8px;
+}
+
+ /* Outline-style quick-filter button. */
+ .filter-btn {
+     padding: 6px 14px;
+     font-size: 0.85rem;
+     border: 1px solid var(--primary-color);
+     /* Theme-aware surface: equals white in the light theme and follows the
+        card colour in the dark theme instead of staying hard-coded white. */
+     background: var(--card-bg);
+     color: var(--primary-color);
+     border-radius: 6px;
+     cursor: pointer;
+     transition: all 0.2s;
+     font-weight: 500;
+ }
+
+.filter-btn:hover {
+ background: var(--primary-color);
+ color: white;
+ border-color: var(--primary-color);
+}
+
+.filter-btn.secondary {
+ background: var(--secondary-color);
+ color: white;
+ border-color: var(--secondary-color);
+}
+
+.filter-btn.secondary:hover {
+ background: #475569;
+}
+
+.selected-tasks {
+ border-top: 1px solid var(--border-color);
+ padding-top: 20px;
+}
+
+.selected-task-item {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 8px 12px;
+ background: var(--background);
+ border-radius: 6px;
+ margin-bottom: 8px;
+}
+
+.selected-task-item button {
+ background: var(--error-color);
+ color: white;
+ border: none;
+ padding: 4px 8px;
+ border-radius: 4px;
+ cursor: pointer;
+ font-size: 0.8rem;
+}
+
+.task-selection-controls {
+ display: flex;
+ gap: 12px;
+ margin-bottom: 20px;
+ justify-content: center;
+}
+
+/* Model Configuration */
+ /* Container for the settings of a single model. */
+ .model-config-item {
+     border: 2px solid var(--border-color);
+     border-radius: 12px;
+     padding: 20px;
+     margin-bottom: 20px;
+     /* --task-header-bg is #fafbfc in the light theme (the previous literal)
+        and has a dark counterpart, so light text stays readable in dark mode. */
+     background: var(--task-header-bg);
+ }
+
+.model-config-item h3 {
+ margin-bottom: 16px;
+ color: var(--primary-color);
+ font-size: 1.2rem;
+}
+
+.model-actions {
+ display: flex;
+ gap: 12px;
+ margin-top: 20px;
+ padding-top: 20px;
+ border-top: 1px solid var(--border-color);
+}
+
+.remove-model-btn {
+ background: var(--error-color);
+ color: white;
+ border: none;
+ padding: 8px 16px;
+ border-radius: 6px;
+ cursor: pointer;
+ font-size: 0.9rem;
+ margin-top: 16px;
+}
+
+.remove-model-btn:hover {
+ background: #b91c1c;
+}
+
+#load-example-btn {
+ background: var(--secondary-color);
+}
+
+#load-example-btn:hover {
+ background: #475569;
+}
+
+/* Form Elements */
+.form-group {
+ margin-bottom: 16px;
+}
+
+.form-group label {
+ display: block;
+ margin-bottom: 6px;
+ font-weight: 500;
+ color: var(--text-primary);
+}
+
+.required-star {
+ color: #FF5C72;
+ font-weight: 500;
+}
+
+ /* Text inputs and selects. Background and text colour are set explicitly
+    because form controls do not inherit them from the page, which left
+    browser-default light fields inside dark-theme cards. */
+ .form-group input,
+ .form-group select {
+     width: 100%;
+     padding: 10px 12px;
+     border: 1px solid var(--border-color);
+     border-radius: 6px;
+     font-size: 0.95rem;
+     background: var(--card-bg);
+     color: var(--text-primary);
+     transition: border-color 0.2s;
+ }
+
+.form-group input:focus,
+.form-group select:focus {
+ outline: none;
+ border-color: var(--primary-color);
+ box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
+}
+
+ /* Multi-line inputs. Background and text colour follow the theme variables;
+    without them the textarea keeps the browser's light defaults in dark mode. */
+ .form-group textarea {
+     width: 100%;
+     padding: 10px 12px;
+     border: 1px solid var(--border-color);
+     border-radius: 6px;
+     font-size: 0.95rem;
+     font-family: inherit;
+     background: var(--card-bg);
+     color: var(--text-primary);
+     resize: vertical;
+     min-height: 100px;
+ }
+
+.form-group textarea:focus {
+ outline: none;
+ border-color: var(--primary-color);
+ box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
+}
+
+.form-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+ gap: 16px;
+}
+
+/* Buttons */
+button {
+ background: var(--primary-color);
+ color: white;
+ border: none;
+ padding: 10px 16px;
+ border-radius: 6px;
+ cursor: pointer;
+ font-size: 0.95rem;
+ font-weight: 500;
+ transition: all 0.2s;
+}
+
+button:hover {
+ background: #1d4ed8;
+ transform: translateY(-1px);
+}
+
+button:active {
+ transform: translateY(0);
+}
+
+button.secondary {
+ background: var(--secondary-color);
+}
+
+#copy-config {
+ margin-left: auto;
+}
+
+button.success {
+ background: var(--success-color);
+}
+
+button.success:hover {
+ background: #15803d;
+}
+
+/* Configuration Preview */
+#config-preview {
+ background: #1e293b;
+ color: #e2e8f0;
+ padding: 20px;
+ border-radius: 8px;
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+ font-size: 0.9rem;
+ line-height: 1.5;
+ overflow-x: auto;
+ white-space: pre-wrap;
+ max-height: 400px;
+ overflow-y: auto;
+}
+
+.preview-actions {
+ display: flex;
+ gap: 12px;
+ margin-bottom: 20px;
+ flex-wrap: wrap;
+}
+
+@media (max-width: 768px) {
+ .container {
+ padding: 12px;
+ }
+
+ header {
+ padding: 24px 16px;
+ margin-bottom: 24px;
+ }
+
+ header h1 {
+ font-size: 2rem;
+ }
+
+ header p {
+ font-size: 1rem;
+ }
+
+ .card {
+ padding: 16px;
+ margin-bottom: 16px;
+ }
+
+ .card h2 {
+ font-size: 1.3rem;
+ margin-bottom: 16px;
+ }
+
+ .categories-grid,
+ .model-grid {
+ grid-template-columns: 1fr;
+ gap: 12px;
+ }
+
+ .category-card {
+ padding: 12px;
+ }
+
+ .task-options .tasks-list {
+ grid-template-columns: 1fr;
+ gap: 8px;
+ }
+
+ .form-grid {
+ grid-template-columns: 1fr;
+ gap: 12px;
+ }
+
+ .form-group {
+ margin-bottom: 12px;
+ }
+
+ .preview-actions {
+ flex-direction: column;
+ gap: 8px;
+ }
+
+ .preview-actions button {
+ width: 100%;
+ padding: 12px;
+ }
+
+ button {
+ width: 100%;
+ padding: 12px;
+ font-size: 1rem;
+ }
+
+ .model-actions {
+ flex-direction: column;
+ gap: 8px;
+ }
+
+ .model-config-item {
+ padding: 16px;
+ }
+
+ .selected-task-item {
+ flex-direction: column;
+ align-items: flex-start;
+ gap: 8px;
+ }
+
+ .selected-task-item button {
+ align-self: flex-end;
+ }
+
+ .results-controls {
+ flex-direction: column;
+ gap: 8px;
+ }
+
+ .results-controls button {
+ width: 100%;
+ }
+
+ .results-grid {
+ grid-template-columns: 1fr;
+ }
+
+ #config-preview {
+ font-size: 0.8rem;
+ padding: 12px;
+ max-height: 300px;
+ }
+}
+
+/* Tablet styles */
+@media (max-width: 1024px) and (min-width: 769px) {
+ .container {
+ padding: 16px;
+ max-width: 100%;
+ }
+
+ .form-grid {
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
+ gap: 14px;
+ }
+
+ .categories-grid {
+ grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
+ }
+
+ .task-options .tasks-list {
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+ }
+
+ .results-grid {
+ grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
+ }
+
+ .model-actions {
+ flex-wrap: wrap;
+ }
+
+ .preview-actions {
+ flex-wrap: wrap;
+ }
+}
+
+/* Large desktop styles */
+@media (min-width: 1201px) {
+ .container {
+ max-width: 1400px;
+ }
+
+ .form-grid {
+ grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
+ }
+
+ .categories-grid {
+ grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
+ }
+
+ .task-options .tasks-list {
+ grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
+ }
+}
+
+/* Small mobile styles */
+@media (max-width: 480px) {
+ .container {
+ padding: 8px;
+ }
+
+ header {
+ padding: 20px 12px;
+ border-radius: 8px;
+ }
+
+ header h1 {
+ font-size: 1.8rem;
+ }
+
+ header p {
+ font-size: 0.95rem;
+ }
+
+ .card {
+ padding: 12px;
+ border-radius: 8px;
+ }
+
+ .card h2 {
+ font-size: 1.2rem;
+ margin-bottom: 12px;
+ }
+
+ .section-description {
+ font-size: 0.9rem;
+ margin-bottom: 16px;
+ }
+
+ .model-config-item {
+ padding: 12px;
+ border-radius: 8px;
+ }
+
+ .model-config-item h3 {
+ font-size: 1.1rem;
+ margin-bottom: 12px;
+ }
+
+ .form-group input,
+ .form-group select,
+ .form-group textarea {
+ padding: 12px;
+ font-size: 1rem;
+ }
+
+ .task-item {
+ padding: 12px 8px;
+ }
+
+ .task-item label {
+ font-size: 1rem;
+ }
+
+ .metric-chip {
+ font-size: 0.8rem;
+ padding: 4px 10px;
+ }
+
+ .selected-task-item {
+ padding: 12px;
+ }
+
+ #config-preview {
+ font-size: 0.75rem;
+ padding: 10px;
+ border-radius: 6px;
+ }
+}
+
+/* Advanced config styling */
+.advanced-config {
+ margin-top: 16px;
+ border: 1px solid var(--border-color);
+ border-radius: 8px;
+}
+
+.advanced-config summary {
+ padding: 12px 16px;
+ background: var(--background);
+ cursor: pointer;
+ font-weight: 500;
+ color: var(--text-primary);
+ border-radius: 8px 8px 0 0;
+}
+
+ /* Hover tint for the collapsible header. --task-header-hover is #f1f5f9 in
+    the light theme (the previous literal) and darkens with the dark theme,
+    where the literal produced light text on a near-white background. */
+ .advanced-config summary:hover {
+     background: var(--task-header-hover);
+ }
+
+.advanced-config[open] summary {
+ border-bottom: 1px solid var(--border-color);
+ border-radius: 8px 8px 0 0;
+ margin-bottom: 0;
+}
+
+.advanced-config .form-grid {
+ padding: 16px;
+}
+
+/* Task selection UI */
+.task-selection-ui {
+ margin-bottom: 24px;
+}
+
+.task-options {
+ margin-top: 16px;
+ padding: 16px;
+ background: var(--background);
+ border-radius: 8px;
+ border: 1px solid var(--border-color);
+}
+
+.task-options h4 {
+ margin: 0 0 8px 0;
+ color: var(--primary-color);
+ font-size: 1.1rem;
+}
+
+.task-options p {
+ margin: 0 0 16px 0;
+ color: var(--text-secondary);
+ font-size: 0.9rem;
+}
+
+ /* Grid of tasks inside the task-options panel. */
+ .task-options .tasks-list {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
+     gap: 12px;
+ }
+
+ /* This base rule sits after the responsive sections of the stylesheet and has
+    the same specificity as their `.task-options .tasks-list` overrides, so
+    source order makes it win at every viewport width. The breakpoint values
+    are therefore repeated here, after the base rule, so they take effect. */
+ @media (max-width: 768px) {
+     .task-options .tasks-list {
+         grid-template-columns: 1fr;
+         gap: 8px;
+     }
+ }
+
+ @media (max-width: 1024px) and (min-width: 769px) {
+     .task-options .tasks-list {
+         grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+     }
+ }
+
+ @media (min-width: 1201px) {
+     .task-options .tasks-list {
+         grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
+     }
+ }
+
+.category-group {
+ margin-bottom: 32px;
+ padding-bottom: 24px;
+ border-bottom: 1px solid var(--border-color);
+}
+
+.category-group:last-child {
+ border-bottom: none;
+ margin-bottom: 0;
+}
+
+.category-group h4 {
+ margin: 0 0 8px 0;
+ color: var(--primary-color);
+ font-size: 1.2rem;
+}
+
+.category-group p {
+ margin: 0 0 16px 0;
+ color: var(--text-secondary);
+ font-size: 0.95rem;
+}
diff --git a/ui/tasks.js b/ui/tasks.js
new file mode 100644
index 0000000..3369cea
--- /dev/null
+++ b/ui/tasks.js
@@ -0,0 +1,1318 @@
+// Auto-generated by generate_tasks.py
+// This file allows the UI to work when opening index.html directly in a browser
+window.TASKS_DATA = {
+ "audio_understanding": {
+ "name": "🔊 Audio Understanding",
+ "description": "Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.",
+ "tasks": {
+ "music_understanding": {
+ "name": "Music Understanding",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mu_chomusic_test"
+ ]
+ },
+ "scene_understanding": {
+ "name": "Scene Understanding",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "audiocaps_qa_test",
+ "audiocaps_test",
+ "clotho_aqa_test",
+ "wavcaps_qa_test",
+ "wavcaps_test"
+ ]
+ }
+ }
+ },
+ "paralinguistics": {
+ "name": "🎭 Paralinguistics",
+ "description": "Tasks that analyze non-verbal aspects of speech such as emotion, gender, accent, and speaker characteristics.",
+ "tasks": {
+ "accent_recognition": {
+ "name": "Accent Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mnsc_pqa_ar_dialogue_test",
+ "mnsc_pqa_ar_sentence_test",
+ "voxceleb_accent_test"
+ ]
+ },
+ "emotion_recognition": {
+ "name": "Emotion Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_emotion_recognition",
+ "meld_emotion_test",
+ "meld_sentiment_test"
+ ]
+ },
+ "gender_recognition": {
+ "name": "Gender Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_gender_recognition",
+ "mnsc_pqa_gr_dialogue_test",
+ "mnsc_pqa_gr_sentence_test",
+ "voxceleb_gender_test"
+ ]
+ },
+ "speaker_diarization": {
+ "name": "Speaker Diarization",
+ "metrics": [
+ "diarization_metrics"
+ ],
+ "configs": [
+ "callhome_diarization_deu",
+ "callhome_diarization_eng",
+ "callhome_diarization_jpn",
+ "callhome_diarization_spa",
+ "callhome_diarization_zho"
+ ]
+ },
+ "speaker_recognition": {
+ "name": "Speaker Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mmau_mini"
+ ]
+ }
+ }
+ },
+ "phonetics": {
+ "name": "📢 Phonetics",
+ "description": "Tasks related to phonetic analysis, phoneme recognition, and speech sound processing.",
+ "tasks": {
+ "phonemes": {
+ "name": "Phonemes",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "voxangeles_phoneme_counting"
+ ]
+ }
+ }
+ },
+ "safety_and_security": {
+ "name": "🔐 Safety and Security",
+ "description": "Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.",
+ "tasks": {
+ "safety": {
+ "name": "Safety",
+ "metrics": [
+ "llm_judge_redteaming"
+ ],
+ "configs": [
+ "advbench"
+ ]
+ },
+ "spoofing": {
+ "name": "Spoofing",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "asvspoof"
+ ]
+ }
+ }
+ },
+ "speech_disorder": {
+ "name": "🩺 Speech Disorder",
+ "description": "Tasks related to detecting and analyzing speech disorders and voice pathologies.",
+ "tasks": {
+ "voice_disorder": {
+ "name": "Voice Disorder",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "stuttering_detection"
+ ]
+ }
+ }
+ },
+ "speech_enhancement": {
+ "name": "✨ Speech Enhancement",
+ "description": "Tasks related to speech quality improvement, noise detection, and audio enhancement.",
+ "tasks": {
+ "noise_detection": {
+ "name": "Noise Detection",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "noise_detection"
+ ]
+ }
+ }
+ },
+ "speech_recognition": {
+ "name": "🗣️ Speech Recognition",
+ "description": "Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.",
+ "tasks": {
+ "asr": {
+ "name": "Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "aishell_1_test",
+ "ami_ihm",
+ "ami_sdm",
+ "callhome_asr_deu",
+ "callhome_asr_eng",
+ "callhome_asr_jpn",
+ "callhome_asr_spa",
+ "callhome_asr_zho",
+ "common_voice_15_ab",
+ "common_voice_15_af",
+ "common_voice_15_am",
+ "common_voice_15_ar",
+ "common_voice_15_as",
+ "common_voice_15_ast",
+ "common_voice_15_az",
+ "common_voice_15_ba",
+ "common_voice_15_bas",
+ "common_voice_15_be",
+ "common_voice_15_bg",
+ "common_voice_15_bn",
+ "common_voice_15_br",
+ "common_voice_15_ca",
+ "common_voice_15_ckb",
+ "common_voice_15_cnh",
+ "common_voice_15_cs",
+ "common_voice_15_cv",
+ "common_voice_15_cy",
+ "common_voice_15_da",
+ "common_voice_15_de",
+ "common_voice_15_dv",
+ "common_voice_15_dyu",
+ "common_voice_15_el",
+ "common_voice_15_en",
+ "common_voice_15_eo",
+ "common_voice_15_es",
+ "common_voice_15_et",
+ "common_voice_15_eu",
+ "common_voice_15_fa",
+ "common_voice_15_fi",
+ "common_voice_15_fr",
+ "common_voice_15_fy-NL",
+ "common_voice_15_ga-IE",
+ "common_voice_15_gl",
+ "common_voice_15_ha",
+ "common_voice_15_hi",
+ "common_voice_15_hsb",
+ "common_voice_15_hu",
+ "common_voice_15_hy-AM",
+ "common_voice_15_ia",
+ "common_voice_15_id",
+ "common_voice_15_ig",
+ "common_voice_15_it",
+ "common_voice_15_ja",
+ "common_voice_15_ka",
+ "common_voice_15_kab",
+ "common_voice_15_kk",
+ "common_voice_15_kmr",
+ "common_voice_15_ko",
+ "common_voice_15_ky",
+ "common_voice_15_lg",
+ "common_voice_15_lt",
+ "common_voice_15_lv",
+ "common_voice_15_mdf",
+ "common_voice_15_mg",
+ "common_voice_15_mk",
+ "common_voice_15_ml",
+ "common_voice_15_mn",
+ "common_voice_15_mr",
+ "common_voice_15_mt",
+ "common_voice_15_myv",
+ "common_voice_15_ne-NP",
+ "common_voice_15_nl",
+ "common_voice_15_nn-NO",
+ "common_voice_15_or",
+ "common_voice_15_pa-IN",
+ "common_voice_15_pl",
+ "common_voice_15_pt",
+ "common_voice_15_rm-sursilv",
+ "common_voice_15_rm-vallader",
+ "common_voice_15_ro",
+ "common_voice_15_ru",
+ "common_voice_15_rw",
+ "common_voice_15_sah",
+ "common_voice_15_sat",
+ "common_voice_15_sc",
+ "common_voice_15_sk",
+ "common_voice_15_sl",
+ "common_voice_15_sr",
+ "common_voice_15_sv-SE",
+ "common_voice_15_sw",
+ "common_voice_15_ta",
+ "common_voice_15_te",
+ "common_voice_15_tg",
+ "common_voice_15_th",
+ "common_voice_15_ti",
+ "common_voice_15_tok",
+ "common_voice_15_tr",
+ "common_voice_15_tt",
+ "common_voice_15_ug",
+ "common_voice_15_uk",
+ "common_voice_15_ur",
+ "common_voice_15_uz",
+ "common_voice_15_vot",
+ "common_voice_15_yi",
+ "common_voice_15_yue",
+ "common_voice_15_zh-CN",
+ "common_voice_15_zh-HK",
+ "common_voice_15_zh-TW",
+ "fleurs_af_za",
+ "fleurs_am_et",
+ "fleurs_ar_eg",
+ "fleurs_as_in",
+ "fleurs_ast_es",
+ "fleurs_az_az",
+ "fleurs_be_by",
+ "fleurs_bg_bg",
+ "fleurs_bn_in",
+ "fleurs_bs_ba",
+ "fleurs_ca_es",
+ "fleurs_ceb_ph",
+ "fleurs_ckb_iq",
+ "fleurs_cmn_hans_cn",
+ "fleurs_cs_cz",
+ "fleurs_cy_gb",
+ "fleurs_da_dk",
+ "fleurs_de_de",
+ "fleurs_el_gr",
+ "fleurs_en_us",
+ "fleurs_es_419",
+ "fleurs_et_ee",
+ "fleurs_fa_ir",
+ "fleurs_ff_sn",
+ "fleurs_fi_fi",
+ "fleurs_fil_ph",
+ "fleurs_fr_fr",
+ "fleurs_ga_ie",
+ "fleurs_gl_es",
+ "fleurs_gu_in",
+ "fleurs_ha_ng",
+ "fleurs_he_il",
+ "fleurs_hi_in",
+ "fleurs_hr_hr",
+ "fleurs_hu_hu",
+ "fleurs_hy_am",
+ "fleurs_id_id",
+ "fleurs_ig_ng",
+ "fleurs_is_is",
+ "fleurs_it_it",
+ "fleurs_ja_jp",
+ "fleurs_jv_id",
+ "fleurs_ka_ge",
+ "fleurs_kam_ke",
+ "fleurs_kea_cv",
+ "fleurs_kk_kz",
+ "fleurs_km_kh",
+ "fleurs_kn_in",
+ "fleurs_ko_kr",
+ "fleurs_ky_kg",
+ "fleurs_lb_lu",
+ "fleurs_lg_ug",
+ "fleurs_ln_cd",
+ "fleurs_lo_la",
+ "fleurs_lt_lt",
+ "fleurs_luo_ke",
+ "fleurs_lv_lv",
+ "fleurs_mi_nz",
+ "fleurs_mk_mk",
+ "fleurs_ml_in",
+ "fleurs_mn_mn",
+ "fleurs_mr_in",
+ "fleurs_ms_my",
+ "fleurs_mt_mt",
+ "fleurs_my_mm",
+ "fleurs_nb_no",
+ "fleurs_ne_np",
+ "fleurs_nl_nl",
+ "fleurs_nso_za",
+ "fleurs_ny_mw",
+ "fleurs_oc_fr",
+ "fleurs_om_et",
+ "fleurs_or_in",
+ "fleurs_pa_in",
+ "fleurs_pl_pl",
+ "fleurs_ps_af",
+ "fleurs_pt_br",
+ "fleurs_ro_ro",
+ "fleurs_ru_ru",
+ "fleurs_sd_in",
+ "fleurs_sk_sk",
+ "fleurs_sl_si",
+ "fleurs_sn_zw",
+ "fleurs_so_so",
+ "fleurs_sr_rs",
+ "fleurs_sv_se",
+ "fleurs_sw_ke",
+ "fleurs_ta_in",
+ "fleurs_te_in",
+ "fleurs_tg_tj",
+ "fleurs_th_th",
+ "fleurs_tr_tr",
+ "fleurs_uk_ua",
+ "fleurs_umb_ao",
+ "fleurs_ur_pk",
+ "fleurs_uz_uz",
+ "fleurs_vi_vn",
+ "fleurs_wo_sn",
+ "fleurs_xh_za",
+ "fleurs_yo_ng",
+ "fleurs_yue_hant_hk",
+ "fleurs_zu_za",
+ "gigaspeech_test",
+ "gigaspeech2_id_test",
+ "gigaspeech2_th_test",
+ "gigaspeech2_vi_test",
+ "librispeech_test_clean",
+ "librispeech_test_other",
+ "librispeech_multilingual_dutch",
+ "librispeech_multilingual_french",
+ "librispeech_multilingual_german",
+ "librispeech_multilingual_italian",
+ "librispeech_multilingual_polish",
+ "librispeech_multilingual_portuguese",
+ "librispeech_multilingual_spanish",
+ "mnsc_asr_part1_test",
+ "mnsc_asr_part2_test",
+ "mnsc_asr_part3_test",
+ "mnsc_asr_part4_test",
+ "mnsc_asr_part5_test",
+ "mnsc_asr_part6_test",
+ "peoples_speech_test",
+ "spgispeech_test",
+ "tedlium3_test",
+ "voxpopuli_cs",
+ "voxpopuli_de",
+ "voxpopuli_en",
+ "voxpopuli_en_accented",
+ "voxpopuli_es",
+ "voxpopuli_et",
+ "voxpopuli_fi",
+ "voxpopuli_fr",
+ "voxpopuli_hr",
+ "voxpopuli_hu",
+ "voxpopuli_it",
+ "voxpopuli_lt",
+ "voxpopuli_nl",
+ "voxpopuli_pl",
+ "voxpopuli_ro",
+ "voxpopuli_sk",
+ "voxpopuli_sl"
+ ]
+ },
+ "code_switching_asr": {
+ "name": "Code Switching Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "seame_dev_man",
+ "seame_dev_sge"
+ ]
+ },
+ "long_form_asr": {
+ "name": "Long Form Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "earnings21",
+ "earnings22",
+ "tedlium3_long_form"
+ ]
+ }
+ }
+ },
+ "spoken_language_reasoning": {
+ "name": "🧩 Spoken Language Reasoning",
+ "description": "Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.",
+ "tasks": {
+ "bfcl": {
+ "name": "Bfcl",
+ "metrics": [
+ "bfcl_match_score"
+ ],
+ "configs": [
+ "bfcl_audio_irrelevance",
+ "bfcl_audio_multiple",
+ "bfcl_audio_parallel",
+ "bfcl_audio_parallel_multiple",
+ "bfcl_audio_simple",
+ "bfcl_audio_irrelevance_no_prompt",
+ "bfcl_audio_multiple_no_prompt",
+ "bfcl_audio_parallel_multiple_no_prompt",
+ "bfcl_audio_parallel_no_prompt",
+ "bfcl_audio_simple_no_prompt",
+ "bfcl_text_irrelevance",
+ "bfcl_text_multiple",
+ "bfcl_text_parallel",
+ "bfcl_text_parallel_multiple",
+ "bfcl_text_simple",
+ "bfcl_text_irrelevance_no_prompt",
+ "bfcl_text_multiple_no_prompt",
+ "bfcl_text_parallel_multiple_no_prompt",
+ "bfcl_text_parallel_no_prompt",
+ "bfcl_text_simple_no_prompt"
+ ]
+ },
+ "gsm8k": {
+ "name": "Gsm8K",
+ "metrics": [
+ "gsm8k_exact_match"
+ ],
+ "configs": [
+ "gsm8k_audio",
+ "gsm8k_text"
+ ]
+ },
+ "ifeval": {
+ "name": "Ifeval",
+ "metrics": [
+ "instruction_following"
+ ],
+ "configs": [
+ "voicebench_ifeval_audio",
+ "voicebench_ifeval_text"
+ ]
+ },
+ "mtbench": {
+ "name": "Mtbench",
+ "metrics": [
+ "mt_bench_llm_judge"
+ ],
+ "configs": [
+ "mtbench_audio",
+ "mtbench_text"
+ ]
+ },
+ "speech_to_sql": {
+ "name": "Speech To Sql",
+ "metrics": [
+ "sql_score"
+ ],
+ "configs": [
+ "spider_audio",
+ "spider_text"
+ ]
+ }
+ }
+ },
+ "spoken_language_understanding": {
+ "name": "🧠 Spoken Language Understanding",
+ "description": "Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.",
+ "tasks": {
+ "intent_classification": {
+ "name": "Intent Classification",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "SLURP-intent"
+ ]
+ },
+ "speech_qa": {
+ "name": "Speech Qa",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "alpaca_audio_test",
+ "cn_college_listen_mcq_test",
+ "dream_tts_mcq_test",
+ "mnsc_sqa_part3_test",
+ "mnsc_sqa_part4_test",
+ "mnsc_sqa_part5_test",
+ "mnsc_sqa_part6_test",
+ "openhermes_instruction_test",
+ "public_sg_speech_qa_test",
+ "slue_p2_sqa5_test",
+ "spoken_squad_test"
+ ]
+ },
+ "spoken_dialogue": {
+ "name": "Spoken Dialogue",
+ "metrics": [
+ "joint_goal_accuracy",
+ "slot_accuracy",
+ "slot_f1",
+ "bleu"
+ ],
+ "configs": [
+ "spokenwoz_audio",
+ "spokenwoz_text"
+ ]
+ },
+ "spoken_dialogue_summarization": {
+ "name": "Spoken Dialogue Summarization",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "mnsc_sds_part3_test",
+ "mnsc_sds_part4_test",
+ "mnsc_sds_part5_test"
+ ]
+ },
+ "sqqa": {
+ "name": "Sqqa",
+ "metrics": [
+ "llm_judge_big_bench_audio"
+ ],
+ "configs": [
+ "big_bench_audio_audio_query",
+ "big_bench_audio_text_query",
+ "mmsu_biology",
+ "mmsu_business",
+ "mmsu_chemistry",
+ "mmsu_economics",
+ "mmsu_engineering",
+ "mmsu_health",
+ "mmsu_history",
+ "mmsu_law",
+ "mmsu_other",
+ "mmsu_philosophy",
+ "mmsu_physics",
+ "mmsu_psychology",
+ "openbookqa",
+ "sd-qa_aus_audio",
+ "sd-qa_aus_text",
+ "sd-qa_gbr_audio",
+ "sd-qa_gbr_text",
+ "sd-qa_ind_n_audio",
+ "sd-qa_ind_n_text",
+ "sd-qa_ind_s_audio",
+ "sd-qa_ind_s_text",
+ "sd-qa_irl_audio",
+ "sd-qa_irl_text",
+ "sd-qa_kenya_audio",
+ "sd-qa_kenya_text",
+ "sd-qa_nga_audio",
+ "sd-qa_nga_text",
+ "sd-qa_nzl_audio",
+ "sd-qa_nzl_text",
+ "sd-qa_phl_audio",
+ "sd-qa_phl_text",
+ "sd-qa_usa_audio",
+ "sd-qa_usa_text",
+ "sd-qa_zaf_audio",
+ "sd-qa_zaf_text"
+ ]
+ },
+ "translation": {
+ "name": "Translation",
+ "metrics": [
+ "bleu",
+ "meteor",
+ "bertscore",
+ "comet"
+ ],
+ "configs": [
+ "covost2_ar_en",
+ "covost2_ca_en",
+ "covost2_cy_en",
+ "covost2_de_en",
+ "covost2_en_ar",
+ "covost2_en_ca",
+ "covost2_en_cy",
+ "covost2_en_de",
+ "covost2_en_et",
+ "covost2_en_fa",
+ "covost2_en_id",
+ "covost2_en_ja",
+ "covost2_en_lv",
+ "covost2_en_mn",
+ "covost2_en_sl",
+ "covost2_en_sv-SE",
+ "covost2_en_ta",
+ "covost2_en_tr",
+ "covost2_en_zh-CN",
+ "covost2_es_en",
+ "covost2_et_en",
+ "covost2_fa_en",
+ "covost2_fr_en",
+ "covost2_id_en",
+ "covost2_it_en",
+ "covost2_ja_en",
+ "covost2_lv_en",
+ "covost2_mn_en",
+ "covost2_nl_en",
+ "covost2_pt_en",
+ "covost2_ru_en",
+ "covost2_sl_en",
+ "covost2_sv-SE_en",
+ "covost2_ta_en",
+ "covost2_tr_en",
+ "covost2_zh-CN_en"
+ ]
+ }
+ }
+ },
+ "music_understanding": {
+ "name": "Music Understanding",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mu_chomusic_test"
+ ],
+ "category": "audio_understanding"
+ },
+ "scene_understanding": {
+ "name": "Scene Understanding",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "audiocaps_qa_test",
+ "audiocaps_test",
+ "clotho_aqa_test",
+ "wavcaps_qa_test",
+ "wavcaps_test"
+ ],
+ "category": "audio_understanding"
+ },
+ "accent_recognition": {
+ "name": "Accent Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mnsc_pqa_ar_dialogue_test",
+ "mnsc_pqa_ar_sentence_test",
+ "voxceleb_accent_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "emotion_recognition": {
+ "name": "Emotion Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_emotion_recognition",
+ "meld_emotion_test",
+ "meld_sentiment_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "gender_recognition": {
+ "name": "Gender Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_gender_recognition",
+ "mnsc_pqa_gr_dialogue_test",
+ "mnsc_pqa_gr_sentence_test",
+ "voxceleb_gender_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "speaker_diarization": {
+ "name": "Speaker Diarization",
+ "metrics": [
+ "diarization_metrics"
+ ],
+ "configs": [
+ "callhome_diarization_deu",
+ "callhome_diarization_eng",
+ "callhome_diarization_jpn",
+ "callhome_diarization_spa",
+ "callhome_diarization_zho"
+ ],
+ "category": "paralinguistics"
+ },
+ "speaker_recognition": {
+ "name": "Speaker Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mmau_mini"
+ ],
+ "category": "paralinguistics"
+ },
+ "phonemes": {
+ "name": "Phonemes",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "voxangeles_phoneme_counting"
+ ],
+ "category": "phonetics"
+ },
+ "safety": {
+ "name": "Safety",
+ "metrics": [
+ "llm_judge_redteaming"
+ ],
+ "configs": [
+ "advbench"
+ ],
+ "category": "safety_and_security"
+ },
+ "spoofing": {
+ "name": "Spoofing",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "asvspoof"
+ ],
+ "category": "safety_and_security"
+ },
+ "voice_disorder": {
+ "name": "Voice Disorder",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "stuttering_detection"
+ ],
+ "category": "speech_disorder"
+ },
+ "noise_detection": {
+ "name": "Noise Detection",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "noise_detection"
+ ],
+ "category": "speech_enhancement"
+ },
+ "asr": {
+ "name": "Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "aishell_1_test",
+ "ami_ihm",
+ "ami_sdm",
+ "callhome_asr_deu",
+ "callhome_asr_eng",
+ "callhome_asr_jpn",
+ "callhome_asr_spa",
+ "callhome_asr_zho",
+ "common_voice_15_ab",
+ "common_voice_15_af",
+ "common_voice_15_am",
+ "common_voice_15_ar",
+ "common_voice_15_as",
+ "common_voice_15_ast",
+ "common_voice_15_az",
+ "common_voice_15_ba",
+ "common_voice_15_bas",
+ "common_voice_15_be",
+ "common_voice_15_bg",
+ "common_voice_15_bn",
+ "common_voice_15_br",
+ "common_voice_15_ca",
+ "common_voice_15_ckb",
+ "common_voice_15_cnh",
+ "common_voice_15_cs",
+ "common_voice_15_cv",
+ "common_voice_15_cy",
+ "common_voice_15_da",
+ "common_voice_15_de",
+ "common_voice_15_dv",
+ "common_voice_15_dyu",
+ "common_voice_15_el",
+ "common_voice_15_en",
+ "common_voice_15_eo",
+ "common_voice_15_es",
+ "common_voice_15_et",
+ "common_voice_15_eu",
+ "common_voice_15_fa",
+ "common_voice_15_fi",
+ "common_voice_15_fr",
+ "common_voice_15_fy-NL",
+ "common_voice_15_ga-IE",
+ "common_voice_15_gl",
+ "common_voice_15_ha",
+ "common_voice_15_hi",
+ "common_voice_15_hsb",
+ "common_voice_15_hu",
+ "common_voice_15_hy-AM",
+ "common_voice_15_ia",
+ "common_voice_15_id",
+ "common_voice_15_ig",
+ "common_voice_15_it",
+ "common_voice_15_ja",
+ "common_voice_15_ka",
+ "common_voice_15_kab",
+ "common_voice_15_kk",
+ "common_voice_15_kmr",
+ "common_voice_15_ko",
+ "common_voice_15_ky",
+ "common_voice_15_lg",
+ "common_voice_15_lt",
+ "common_voice_15_lv",
+ "common_voice_15_mdf",
+ "common_voice_15_mg",
+ "common_voice_15_mk",
+ "common_voice_15_ml",
+ "common_voice_15_mn",
+ "common_voice_15_mr",
+ "common_voice_15_mt",
+ "common_voice_15_myv",
+ "common_voice_15_ne-NP",
+ "common_voice_15_nl",
+ "common_voice_15_nn-NO",
+ "common_voice_15_or",
+ "common_voice_15_pa-IN",
+ "common_voice_15_pl",
+ "common_voice_15_pt",
+ "common_voice_15_rm-sursilv",
+ "common_voice_15_rm-vallader",
+ "common_voice_15_ro",
+ "common_voice_15_ru",
+ "common_voice_15_rw",
+ "common_voice_15_sah",
+ "common_voice_15_sat",
+ "common_voice_15_sc",
+ "common_voice_15_sk",
+ "common_voice_15_sl",
+ "common_voice_15_sr",
+ "common_voice_15_sv-SE",
+ "common_voice_15_sw",
+ "common_voice_15_ta",
+ "common_voice_15_te",
+ "common_voice_15_tg",
+ "common_voice_15_th",
+ "common_voice_15_ti",
+ "common_voice_15_tok",
+ "common_voice_15_tr",
+ "common_voice_15_tt",
+ "common_voice_15_ug",
+ "common_voice_15_uk",
+ "common_voice_15_ur",
+ "common_voice_15_uz",
+ "common_voice_15_vot",
+ "common_voice_15_yi",
+ "common_voice_15_yue",
+ "common_voice_15_zh-CN",
+ "common_voice_15_zh-HK",
+ "common_voice_15_zh-TW",
+ "fleurs_af_za",
+ "fleurs_am_et",
+ "fleurs_ar_eg",
+ "fleurs_as_in",
+ "fleurs_ast_es",
+ "fleurs_az_az",
+ "fleurs_be_by",
+ "fleurs_bg_bg",
+ "fleurs_bn_in",
+ "fleurs_bs_ba",
+ "fleurs_ca_es",
+ "fleurs_ceb_ph",
+ "fleurs_ckb_iq",
+ "fleurs_cmn_hans_cn",
+ "fleurs_cs_cz",
+ "fleurs_cy_gb",
+ "fleurs_da_dk",
+ "fleurs_de_de",
+ "fleurs_el_gr",
+ "fleurs_en_us",
+ "fleurs_es_419",
+ "fleurs_et_ee",
+ "fleurs_fa_ir",
+ "fleurs_ff_sn",
+ "fleurs_fi_fi",
+ "fleurs_fil_ph",
+ "fleurs_fr_fr",
+ "fleurs_ga_ie",
+ "fleurs_gl_es",
+ "fleurs_gu_in",
+ "fleurs_ha_ng",
+ "fleurs_he_il",
+ "fleurs_hi_in",
+ "fleurs_hr_hr",
+ "fleurs_hu_hu",
+ "fleurs_hy_am",
+ "fleurs_id_id",
+ "fleurs_ig_ng",
+ "fleurs_is_is",
+ "fleurs_it_it",
+ "fleurs_ja_jp",
+ "fleurs_jv_id",
+ "fleurs_ka_ge",
+ "fleurs_kam_ke",
+ "fleurs_kea_cv",
+ "fleurs_kk_kz",
+ "fleurs_km_kh",
+ "fleurs_kn_in",
+ "fleurs_ko_kr",
+ "fleurs_ky_kg",
+ "fleurs_lb_lu",
+ "fleurs_lg_ug",
+ "fleurs_ln_cd",
+ "fleurs_lo_la",
+ "fleurs_lt_lt",
+ "fleurs_luo_ke",
+ "fleurs_lv_lv",
+ "fleurs_mi_nz",
+ "fleurs_mk_mk",
+ "fleurs_ml_in",
+ "fleurs_mn_mn",
+ "fleurs_mr_in",
+ "fleurs_ms_my",
+ "fleurs_mt_mt",
+ "fleurs_my_mm",
+ "fleurs_nb_no",
+ "fleurs_ne_np",
+ "fleurs_nl_nl",
+ "fleurs_nso_za",
+ "fleurs_ny_mw",
+ "fleurs_oc_fr",
+ "fleurs_om_et",
+ "fleurs_or_in",
+ "fleurs_pa_in",
+ "fleurs_pl_pl",
+ "fleurs_ps_af",
+ "fleurs_pt_br",
+ "fleurs_ro_ro",
+ "fleurs_ru_ru",
+ "fleurs_sd_in",
+ "fleurs_sk_sk",
+ "fleurs_sl_si",
+ "fleurs_sn_zw",
+ "fleurs_so_so",
+ "fleurs_sr_rs",
+ "fleurs_sv_se",
+ "fleurs_sw_ke",
+ "fleurs_ta_in",
+ "fleurs_te_in",
+ "fleurs_tg_tj",
+ "fleurs_th_th",
+ "fleurs_tr_tr",
+ "fleurs_uk_ua",
+ "fleurs_umb_ao",
+ "fleurs_ur_pk",
+ "fleurs_uz_uz",
+ "fleurs_vi_vn",
+ "fleurs_wo_sn",
+ "fleurs_xh_za",
+ "fleurs_yo_ng",
+ "fleurs_yue_hant_hk",
+ "fleurs_zu_za",
+ "gigaspeech_test",
+ "gigaspeech2_id_test",
+ "gigaspeech2_th_test",
+ "gigaspeech2_vi_test",
+ "librispeech_test_clean",
+ "librispeech_test_other",
+ "librispeech_multilingual_dutch",
+ "librispeech_multilingual_french",
+ "librispeech_multilingual_german",
+ "librispeech_multilingual_italian",
+ "librispeech_multilingual_polish",
+ "librispeech_multilingual_portuguese",
+ "librispeech_multilingual_spanish",
+ "mnsc_asr_part1_test",
+ "mnsc_asr_part2_test",
+ "mnsc_asr_part3_test",
+ "mnsc_asr_part4_test",
+ "mnsc_asr_part5_test",
+ "mnsc_asr_part6_test",
+ "peoples_speech_test",
+ "spgispeech_test",
+ "tedlium3_test",
+ "voxpopuli_cs",
+ "voxpopuli_de",
+ "voxpopuli_en",
+ "voxpopuli_en_accented",
+ "voxpopuli_es",
+ "voxpopuli_et",
+ "voxpopuli_fi",
+ "voxpopuli_fr",
+ "voxpopuli_hr",
+ "voxpopuli_hu",
+ "voxpopuli_it",
+ "voxpopuli_lt",
+ "voxpopuli_nl",
+ "voxpopuli_pl",
+ "voxpopuli_ro",
+ "voxpopuli_sk",
+ "voxpopuli_sl"
+ ],
+ "category": "speech_recognition"
+ },
+ "code_switching_asr": {
+ "name": "Code Switching Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "seame_dev_man",
+ "seame_dev_sge"
+ ],
+ "category": "speech_recognition"
+ },
+ "long_form_asr": {
+ "name": "Long Form Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "earnings21",
+ "earnings22",
+ "tedlium3_long_form"
+ ],
+ "category": "speech_recognition"
+ },
+ "bfcl": {
+ "name": "Bfcl",
+ "metrics": [
+ "bfcl_match_score"
+ ],
+ "configs": [
+ "bfcl_audio_irrelevance",
+ "bfcl_audio_multiple",
+ "bfcl_audio_parallel",
+ "bfcl_audio_parallel_multiple",
+ "bfcl_audio_simple",
+ "bfcl_audio_irrelevance_no_prompt",
+ "bfcl_audio_multiple_no_prompt",
+ "bfcl_audio_parallel_multiple_no_prompt",
+ "bfcl_audio_parallel_no_prompt",
+ "bfcl_audio_simple_no_prompt",
+ "bfcl_text_irrelevance",
+ "bfcl_text_multiple",
+ "bfcl_text_parallel",
+ "bfcl_text_parallel_multiple",
+ "bfcl_text_simple",
+ "bfcl_text_irrelevance_no_prompt",
+ "bfcl_text_multiple_no_prompt",
+ "bfcl_text_parallel_multiple_no_prompt",
+ "bfcl_text_parallel_no_prompt",
+ "bfcl_text_simple_no_prompt"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "gsm8k": {
+ "name": "Gsm8K",
+ "metrics": [
+ "gsm8k_exact_match"
+ ],
+ "configs": [
+ "gsm8k_audio",
+ "gsm8k_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "ifeval": {
+ "name": "Ifeval",
+ "metrics": [
+ "instruction_following"
+ ],
+ "configs": [
+ "voicebench_ifeval_audio",
+ "voicebench_ifeval_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "mtbench": {
+ "name": "Mtbench",
+ "metrics": [
+ "mt_bench_llm_judge"
+ ],
+ "configs": [
+ "mtbench_audio",
+ "mtbench_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "speech_to_sql": {
+ "name": "Speech To Sql",
+ "metrics": [
+ "sql_score"
+ ],
+ "configs": [
+ "spider_audio",
+ "spider_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "intent_classification": {
+ "name": "Intent Classification",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "SLURP-intent"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "speech_qa": {
+ "name": "Speech Qa",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "alpaca_audio_test",
+ "cn_college_listen_mcq_test",
+ "dream_tts_mcq_test",
+ "mnsc_sqa_part3_test",
+ "mnsc_sqa_part4_test",
+ "mnsc_sqa_part5_test",
+ "mnsc_sqa_part6_test",
+ "openhermes_instruction_test",
+ "public_sg_speech_qa_test",
+ "slue_p2_sqa5_test",
+ "spoken_squad_test"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "spoken_dialogue": {
+ "name": "Spoken Dialogue",
+ "metrics": [
+ "joint_goal_accuracy",
+ "slot_accuracy",
+ "slot_f1",
+ "bleu"
+ ],
+ "configs": [
+ "spokenwoz_audio",
+ "spokenwoz_text"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "spoken_dialogue_summarization": {
+ "name": "Spoken Dialogue Summarization",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "mnsc_sds_part3_test",
+ "mnsc_sds_part4_test",
+ "mnsc_sds_part5_test"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "sqqa": {
+ "name": "Sqqa",
+ "metrics": [
+ "llm_judge_big_bench_audio"
+ ],
+ "configs": [
+ "big_bench_audio_audio_query",
+ "big_bench_audio_text_query",
+ "mmsu_biology",
+ "mmsu_business",
+ "mmsu_chemistry",
+ "mmsu_economics",
+ "mmsu_engineering",
+ "mmsu_health",
+ "mmsu_history",
+ "mmsu_law",
+ "mmsu_other",
+ "mmsu_philosophy",
+ "mmsu_physics",
+ "mmsu_psychology",
+ "openbookqa",
+ "sd-qa_aus_audio",
+ "sd-qa_aus_text",
+ "sd-qa_gbr_audio",
+ "sd-qa_gbr_text",
+ "sd-qa_ind_n_audio",
+ "sd-qa_ind_n_text",
+ "sd-qa_ind_s_audio",
+ "sd-qa_ind_s_text",
+ "sd-qa_irl_audio",
+ "sd-qa_irl_text",
+ "sd-qa_kenya_audio",
+ "sd-qa_kenya_text",
+ "sd-qa_nga_audio",
+ "sd-qa_nga_text",
+ "sd-qa_nzl_audio",
+ "sd-qa_nzl_text",
+ "sd-qa_phl_audio",
+ "sd-qa_phl_text",
+ "sd-qa_usa_audio",
+ "sd-qa_usa_text",
+ "sd-qa_zaf_audio",
+ "sd-qa_zaf_text"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "translation": {
+ "name": "Translation",
+ "metrics": [
+ "bleu",
+ "meteor",
+ "bertscore",
+ "comet"
+ ],
+ "configs": [
+ "covost2_ar_en",
+ "covost2_ca_en",
+ "covost2_cy_en",
+ "covost2_de_en",
+ "covost2_en_ar",
+ "covost2_en_ca",
+ "covost2_en_cy",
+ "covost2_en_de",
+ "covost2_en_et",
+ "covost2_en_fa",
+ "covost2_en_id",
+ "covost2_en_ja",
+ "covost2_en_lv",
+ "covost2_en_mn",
+ "covost2_en_sl",
+ "covost2_en_sv-SE",
+ "covost2_en_ta",
+ "covost2_en_tr",
+ "covost2_en_zh-CN",
+ "covost2_es_en",
+ "covost2_et_en",
+ "covost2_fa_en",
+ "covost2_fr_en",
+ "covost2_id_en",
+ "covost2_it_en",
+ "covost2_ja_en",
+ "covost2_lv_en",
+ "covost2_mn_en",
+ "covost2_nl_en",
+ "covost2_pt_en",
+ "covost2_ru_en",
+ "covost2_sl_en",
+ "covost2_sv-SE_en",
+ "covost2_ta_en",
+ "covost2_tr_en",
+ "covost2_zh-CN_en"
+ ],
+ "category": "spoken_language_understanding"
+ }
+};
diff --git a/ui/tasks.json b/ui/tasks.json
new file mode 100644
index 0000000..996b6f2
--- /dev/null
+++ b/ui/tasks.json
@@ -0,0 +1,1316 @@
+{
+ "audio_understanding": {
+ "name": "🔊 Audio Understanding",
+ "description": "Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.",
+ "tasks": {
+ "music_understanding": {
+ "name": "Music Understanding",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mu_chomusic_test"
+ ]
+ },
+ "scene_understanding": {
+ "name": "Scene Understanding",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "audiocaps_qa_test",
+ "audiocaps_test",
+ "clotho_aqa_test",
+ "wavcaps_qa_test",
+ "wavcaps_test"
+ ]
+ }
+ }
+ },
+ "paralinguistics": {
+ "name": "🎭 Paralinguistics",
+ "description": "Tasks that analyze non-verbal aspects of speech such as emotion, gender, accent, and speaker characteristics.",
+ "tasks": {
+ "accent_recognition": {
+ "name": "Accent Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mnsc_pqa_ar_dialogue_test",
+ "mnsc_pqa_ar_sentence_test",
+ "voxceleb_accent_test"
+ ]
+ },
+ "emotion_recognition": {
+ "name": "Emotion Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_emotion_recognition",
+ "meld_emotion_test",
+ "meld_sentiment_test"
+ ]
+ },
+ "gender_recognition": {
+ "name": "Gender Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_gender_recognition",
+ "mnsc_pqa_gr_dialogue_test",
+ "mnsc_pqa_gr_sentence_test",
+ "voxceleb_gender_test"
+ ]
+ },
+ "speaker_diarization": {
+ "name": "Speaker Diarization",
+ "metrics": [
+ "diarization_metrics"
+ ],
+ "configs": [
+ "callhome_diarization_deu",
+ "callhome_diarization_eng",
+ "callhome_diarization_jpn",
+ "callhome_diarization_spa",
+ "callhome_diarization_zho"
+ ]
+ },
+ "speaker_recognition": {
+ "name": "Speaker Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mmau_mini"
+ ]
+ }
+ }
+ },
+ "phonetics": {
+ "name": "📢 Phonetics",
+ "description": "Tasks related to phonetic analysis, phoneme recognition, and speech sound processing.",
+ "tasks": {
+ "phonemes": {
+ "name": "Phonemes",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "voxangeles_phoneme_counting"
+ ]
+ }
+ }
+ },
+ "safety_and_security": {
+ "name": "🔐 Safety and Security",
+ "description": "Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.",
+ "tasks": {
+ "safety": {
+ "name": "Safety",
+ "metrics": [
+ "llm_judge_redteaming"
+ ],
+ "configs": [
+ "advbench"
+ ]
+ },
+ "spoofing": {
+ "name": "Spoofing",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "asvspoof"
+ ]
+ }
+ }
+ },
+ "speech_disorder": {
+ "name": "🩺 Speech Disorder",
+ "description": "Tasks related to detecting and analyzing speech disorders and voice pathologies.",
+ "tasks": {
+ "voice_disorder": {
+ "name": "Voice Disorder",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "stuttering_detection"
+ ]
+ }
+ }
+ },
+ "speech_enhancement": {
+ "name": "✨ Speech Enhancement",
+ "description": "Tasks related to speech quality improvement, noise detection, and audio enhancement.",
+ "tasks": {
+ "noise_detection": {
+ "name": "Noise Detection",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "noise_detection"
+ ]
+ }
+ }
+ },
+ "speech_recognition": {
+ "name": "🗣️ Speech Recognition",
+ "description": "Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.",
+ "tasks": {
+ "asr": {
+ "name": "Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "aishell_1_test",
+ "ami_ihm",
+ "ami_sdm",
+ "callhome_asr_deu",
+ "callhome_asr_eng",
+ "callhome_asr_jpn",
+ "callhome_asr_spa",
+ "callhome_asr_zho",
+ "common_voice_15_ab",
+ "common_voice_15_af",
+ "common_voice_15_am",
+ "common_voice_15_ar",
+ "common_voice_15_as",
+ "common_voice_15_ast",
+ "common_voice_15_az",
+ "common_voice_15_ba",
+ "common_voice_15_bas",
+ "common_voice_15_be",
+ "common_voice_15_bg",
+ "common_voice_15_bn",
+ "common_voice_15_br",
+ "common_voice_15_ca",
+ "common_voice_15_ckb",
+ "common_voice_15_cnh",
+ "common_voice_15_cs",
+ "common_voice_15_cv",
+ "common_voice_15_cy",
+ "common_voice_15_da",
+ "common_voice_15_de",
+ "common_voice_15_dv",
+ "common_voice_15_dyu",
+ "common_voice_15_el",
+ "common_voice_15_en",
+ "common_voice_15_eo",
+ "common_voice_15_es",
+ "common_voice_15_et",
+ "common_voice_15_eu",
+ "common_voice_15_fa",
+ "common_voice_15_fi",
+ "common_voice_15_fr",
+ "common_voice_15_fy-NL",
+ "common_voice_15_ga-IE",
+ "common_voice_15_gl",
+ "common_voice_15_ha",
+ "common_voice_15_hi",
+ "common_voice_15_hsb",
+ "common_voice_15_hu",
+ "common_voice_15_hy-AM",
+ "common_voice_15_ia",
+ "common_voice_15_id",
+ "common_voice_15_ig",
+ "common_voice_15_it",
+ "common_voice_15_ja",
+ "common_voice_15_ka",
+ "common_voice_15_kab",
+ "common_voice_15_kk",
+ "common_voice_15_kmr",
+ "common_voice_15_ko",
+ "common_voice_15_ky",
+ "common_voice_15_lg",
+ "common_voice_15_lt",
+ "common_voice_15_lv",
+ "common_voice_15_mdf",
+ "common_voice_15_mg",
+ "common_voice_15_mk",
+ "common_voice_15_ml",
+ "common_voice_15_mn",
+ "common_voice_15_mr",
+ "common_voice_15_mt",
+ "common_voice_15_myv",
+ "common_voice_15_ne-NP",
+ "common_voice_15_nl",
+ "common_voice_15_nn-NO",
+ "common_voice_15_or",
+ "common_voice_15_pa-IN",
+ "common_voice_15_pl",
+ "common_voice_15_pt",
+ "common_voice_15_rm-sursilv",
+ "common_voice_15_rm-vallader",
+ "common_voice_15_ro",
+ "common_voice_15_ru",
+ "common_voice_15_rw",
+ "common_voice_15_sah",
+ "common_voice_15_sat",
+ "common_voice_15_sc",
+ "common_voice_15_sk",
+ "common_voice_15_sl",
+ "common_voice_15_sr",
+ "common_voice_15_sv-SE",
+ "common_voice_15_sw",
+ "common_voice_15_ta",
+ "common_voice_15_te",
+ "common_voice_15_tg",
+ "common_voice_15_th",
+ "common_voice_15_ti",
+ "common_voice_15_tok",
+ "common_voice_15_tr",
+ "common_voice_15_tt",
+ "common_voice_15_ug",
+ "common_voice_15_uk",
+ "common_voice_15_ur",
+ "common_voice_15_uz",
+ "common_voice_15_vot",
+ "common_voice_15_yi",
+ "common_voice_15_yue",
+ "common_voice_15_zh-CN",
+ "common_voice_15_zh-HK",
+ "common_voice_15_zh-TW",
+ "fleurs_af_za",
+ "fleurs_am_et",
+ "fleurs_ar_eg",
+ "fleurs_as_in",
+ "fleurs_ast_es",
+ "fleurs_az_az",
+ "fleurs_be_by",
+ "fleurs_bg_bg",
+ "fleurs_bn_in",
+ "fleurs_bs_ba",
+ "fleurs_ca_es",
+ "fleurs_ceb_ph",
+ "fleurs_ckb_iq",
+ "fleurs_cmn_hans_cn",
+ "fleurs_cs_cz",
+ "fleurs_cy_gb",
+ "fleurs_da_dk",
+ "fleurs_de_de",
+ "fleurs_el_gr",
+ "fleurs_en_us",
+ "fleurs_es_419",
+ "fleurs_et_ee",
+ "fleurs_fa_ir",
+ "fleurs_ff_sn",
+ "fleurs_fi_fi",
+ "fleurs_fil_ph",
+ "fleurs_fr_fr",
+ "fleurs_ga_ie",
+ "fleurs_gl_es",
+ "fleurs_gu_in",
+ "fleurs_ha_ng",
+ "fleurs_he_il",
+ "fleurs_hi_in",
+ "fleurs_hr_hr",
+ "fleurs_hu_hu",
+ "fleurs_hy_am",
+ "fleurs_id_id",
+ "fleurs_ig_ng",
+ "fleurs_is_is",
+ "fleurs_it_it",
+ "fleurs_ja_jp",
+ "fleurs_jv_id",
+ "fleurs_ka_ge",
+ "fleurs_kam_ke",
+ "fleurs_kea_cv",
+ "fleurs_kk_kz",
+ "fleurs_km_kh",
+ "fleurs_kn_in",
+ "fleurs_ko_kr",
+ "fleurs_ky_kg",
+ "fleurs_lb_lu",
+ "fleurs_lg_ug",
+ "fleurs_ln_cd",
+ "fleurs_lo_la",
+ "fleurs_lt_lt",
+ "fleurs_luo_ke",
+ "fleurs_lv_lv",
+ "fleurs_mi_nz",
+ "fleurs_mk_mk",
+ "fleurs_ml_in",
+ "fleurs_mn_mn",
+ "fleurs_mr_in",
+ "fleurs_ms_my",
+ "fleurs_mt_mt",
+ "fleurs_my_mm",
+ "fleurs_nb_no",
+ "fleurs_ne_np",
+ "fleurs_nl_nl",
+ "fleurs_nso_za",
+ "fleurs_ny_mw",
+ "fleurs_oc_fr",
+ "fleurs_om_et",
+ "fleurs_or_in",
+ "fleurs_pa_in",
+ "fleurs_pl_pl",
+ "fleurs_ps_af",
+ "fleurs_pt_br",
+ "fleurs_ro_ro",
+ "fleurs_ru_ru",
+ "fleurs_sd_in",
+ "fleurs_sk_sk",
+ "fleurs_sl_si",
+ "fleurs_sn_zw",
+ "fleurs_so_so",
+ "fleurs_sr_rs",
+ "fleurs_sv_se",
+ "fleurs_sw_ke",
+ "fleurs_ta_in",
+ "fleurs_te_in",
+ "fleurs_tg_tj",
+ "fleurs_th_th",
+ "fleurs_tr_tr",
+ "fleurs_uk_ua",
+ "fleurs_umb_ao",
+ "fleurs_ur_pk",
+ "fleurs_uz_uz",
+ "fleurs_vi_vn",
+ "fleurs_wo_sn",
+ "fleurs_xh_za",
+ "fleurs_yo_ng",
+ "fleurs_yue_hant_hk",
+ "fleurs_zu_za",
+ "gigaspeech_test",
+ "gigaspeech2_id_test",
+ "gigaspeech2_th_test",
+ "gigaspeech2_vi_test",
+ "librispeech_test_clean",
+ "librispeech_test_other",
+ "librispeech_multilingual_dutch",
+ "librispeech_multilingual_french",
+ "librispeech_multilingual_german",
+ "librispeech_multilingual_italian",
+ "librispeech_multilingual_polish",
+ "librispeech_multilingual_portuguese",
+ "librispeech_multilingual_spanish",
+ "mnsc_asr_part1_test",
+ "mnsc_asr_part2_test",
+ "mnsc_asr_part3_test",
+ "mnsc_asr_part4_test",
+ "mnsc_asr_part5_test",
+ "mnsc_asr_part6_test",
+ "peoples_speech_test",
+ "spgispeech_test",
+ "tedlium3_test",
+ "voxpopuli_cs",
+ "voxpopuli_de",
+ "voxpopuli_en",
+ "voxpopuli_en_accented",
+ "voxpopuli_es",
+ "voxpopuli_et",
+ "voxpopuli_fi",
+ "voxpopuli_fr",
+ "voxpopuli_hr",
+ "voxpopuli_hu",
+ "voxpopuli_it",
+ "voxpopuli_lt",
+ "voxpopuli_nl",
+ "voxpopuli_pl",
+ "voxpopuli_ro",
+ "voxpopuli_sk",
+ "voxpopuli_sl"
+ ]
+ },
+ "code_switching_asr": {
+ "name": "Code Switching Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "seame_dev_man",
+ "seame_dev_sge"
+ ]
+ },
+ "long_form_asr": {
+ "name": "Long Form Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "earnings21",
+ "earnings22",
+ "tedlium3_long_form"
+ ]
+ }
+ }
+ },
+ "spoken_language_reasoning": {
+ "name": "🧩 Spoken Language Reasoning",
+ "description": "Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.",
+ "tasks": {
+ "bfcl": {
+ "name": "Bfcl",
+ "metrics": [
+ "bfcl_match_score"
+ ],
+ "configs": [
+ "bfcl_audio_irrelevance",
+ "bfcl_audio_multiple",
+ "bfcl_audio_parallel",
+ "bfcl_audio_parallel_multiple",
+ "bfcl_audio_simple",
+ "bfcl_audio_irrelevance_no_prompt",
+ "bfcl_audio_multiple_no_prompt",
+ "bfcl_audio_parallel_multiple_no_prompt",
+ "bfcl_audio_parallel_no_prompt",
+ "bfcl_audio_simple_no_prompt",
+ "bfcl_text_irrelevance",
+ "bfcl_text_multiple",
+ "bfcl_text_parallel",
+ "bfcl_text_parallel_multiple",
+ "bfcl_text_simple",
+ "bfcl_text_irrelevance_no_prompt",
+ "bfcl_text_multiple_no_prompt",
+ "bfcl_text_parallel_multiple_no_prompt",
+ "bfcl_text_parallel_no_prompt",
+ "bfcl_text_simple_no_prompt"
+ ]
+ },
+ "gsm8k": {
+ "name": "Gsm8K",
+ "metrics": [
+ "gsm8k_exact_match"
+ ],
+ "configs": [
+ "gsm8k_audio",
+ "gsm8k_text"
+ ]
+ },
+ "ifeval": {
+ "name": "Ifeval",
+ "metrics": [
+ "instruction_following"
+ ],
+ "configs": [
+ "voicebench_ifeval_audio",
+ "voicebench_ifeval_text"
+ ]
+ },
+ "mtbench": {
+ "name": "Mtbench",
+ "metrics": [
+ "mt_bench_llm_judge"
+ ],
+ "configs": [
+ "mtbench_audio",
+ "mtbench_text"
+ ]
+ },
+ "speech_to_sql": {
+ "name": "Speech To Sql",
+ "metrics": [
+ "sql_score"
+ ],
+ "configs": [
+ "spider_audio",
+ "spider_text"
+ ]
+ }
+ }
+ },
+ "spoken_language_understanding": {
+ "name": "🧠 Spoken Language Understanding",
+ "description": "Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.",
+ "tasks": {
+ "intent_classification": {
+ "name": "Intent Classification",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "SLURP-intent"
+ ]
+ },
+ "speech_qa": {
+ "name": "Speech Qa",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "alpaca_audio_test",
+ "cn_college_listen_mcq_test",
+ "dream_tts_mcq_test",
+ "mnsc_sqa_part3_test",
+ "mnsc_sqa_part4_test",
+ "mnsc_sqa_part5_test",
+ "mnsc_sqa_part6_test",
+ "openhermes_instruction_test",
+ "public_sg_speech_qa_test",
+ "slue_p2_sqa5_test",
+ "spoken_squad_test"
+ ]
+ },
+ "spoken_dialogue": {
+ "name": "Spoken Dialogue",
+ "metrics": [
+ "joint_goal_accuracy",
+ "slot_accuracy",
+ "slot_f1",
+ "bleu"
+ ],
+ "configs": [
+ "spokenwoz_audio",
+ "spokenwoz_text"
+ ]
+ },
+ "spoken_dialogue_summarization": {
+ "name": "Spoken Dialogue Summarization",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "mnsc_sds_part3_test",
+ "mnsc_sds_part4_test",
+ "mnsc_sds_part5_test"
+ ]
+ },
+ "sqqa": {
+ "name": "Sqqa",
+ "metrics": [
+ "llm_judge_big_bench_audio"
+ ],
+ "configs": [
+ "big_bench_audio_audio_query",
+ "big_bench_audio_text_query",
+ "mmsu_biology",
+ "mmsu_business",
+ "mmsu_chemistry",
+ "mmsu_economics",
+ "mmsu_engineering",
+ "mmsu_health",
+ "mmsu_history",
+ "mmsu_law",
+ "mmsu_other",
+ "mmsu_philosophy",
+ "mmsu_physics",
+ "mmsu_psychology",
+ "openbookqa",
+ "sd-qa_aus_audio",
+ "sd-qa_aus_text",
+ "sd-qa_gbr_audio",
+ "sd-qa_gbr_text",
+ "sd-qa_ind_n_audio",
+ "sd-qa_ind_n_text",
+ "sd-qa_ind_s_audio",
+ "sd-qa_ind_s_text",
+ "sd-qa_irl_audio",
+ "sd-qa_irl_text",
+ "sd-qa_kenya_audio",
+ "sd-qa_kenya_text",
+ "sd-qa_nga_audio",
+ "sd-qa_nga_text",
+ "sd-qa_nzl_audio",
+ "sd-qa_nzl_text",
+ "sd-qa_phl_audio",
+ "sd-qa_phl_text",
+ "sd-qa_usa_audio",
+ "sd-qa_usa_text",
+ "sd-qa_zaf_audio",
+ "sd-qa_zaf_text"
+ ]
+ },
+ "translation": {
+ "name": "Translation",
+ "metrics": [
+ "bleu",
+ "meteor",
+ "bertscore",
+ "comet"
+ ],
+ "configs": [
+ "covost2_ar_en",
+ "covost2_ca_en",
+ "covost2_cy_en",
+ "covost2_de_en",
+ "covost2_en_ar",
+ "covost2_en_ca",
+ "covost2_en_cy",
+ "covost2_en_de",
+ "covost2_en_et",
+ "covost2_en_fa",
+ "covost2_en_id",
+ "covost2_en_ja",
+ "covost2_en_lv",
+ "covost2_en_mn",
+ "covost2_en_sl",
+ "covost2_en_sv-SE",
+ "covost2_en_ta",
+ "covost2_en_tr",
+ "covost2_en_zh-CN",
+ "covost2_es_en",
+ "covost2_et_en",
+ "covost2_fa_en",
+ "covost2_fr_en",
+ "covost2_id_en",
+ "covost2_it_en",
+ "covost2_ja_en",
+ "covost2_lv_en",
+ "covost2_mn_en",
+ "covost2_nl_en",
+ "covost2_pt_en",
+ "covost2_ru_en",
+ "covost2_sl_en",
+ "covost2_sv-SE_en",
+ "covost2_ta_en",
+ "covost2_tr_en",
+ "covost2_zh-CN_en"
+ ]
+ }
+ }
+ },
+ "music_understanding": {
+ "name": "Music Understanding",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mu_chomusic_test"
+ ],
+ "category": "audio_understanding"
+ },
+ "scene_understanding": {
+ "name": "Scene Understanding",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "audiocaps_qa_test",
+ "audiocaps_test",
+ "clotho_aqa_test",
+ "wavcaps_qa_test",
+ "wavcaps_test"
+ ],
+ "category": "audio_understanding"
+ },
+ "accent_recognition": {
+ "name": "Accent Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mnsc_pqa_ar_dialogue_test",
+ "mnsc_pqa_ar_sentence_test",
+ "voxceleb_accent_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "emotion_recognition": {
+ "name": "Emotion Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_emotion_recognition",
+ "meld_emotion_test",
+ "meld_sentiment_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "gender_recognition": {
+ "name": "Gender Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "iemocap_gender_recognition",
+ "mnsc_pqa_gr_dialogue_test",
+ "mnsc_pqa_gr_sentence_test",
+ "voxceleb_gender_test"
+ ],
+ "category": "paralinguistics"
+ },
+ "speaker_diarization": {
+ "name": "Speaker Diarization",
+ "metrics": [
+ "diarization_metrics"
+ ],
+ "configs": [
+ "callhome_diarization_deu",
+ "callhome_diarization_eng",
+ "callhome_diarization_jpn",
+ "callhome_diarization_spa",
+ "callhome_diarization_zho"
+ ],
+ "category": "paralinguistics"
+ },
+ "speaker_recognition": {
+ "name": "Speaker Recognition",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "mmau_mini"
+ ],
+ "category": "paralinguistics"
+ },
+ "phonemes": {
+ "name": "Phonemes",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "voxangeles_phoneme_counting"
+ ],
+ "category": "phonetics"
+ },
+ "safety": {
+ "name": "Safety",
+ "metrics": [
+ "llm_judge_redteaming"
+ ],
+ "configs": [
+ "advbench"
+ ],
+ "category": "safety_and_security"
+ },
+ "spoofing": {
+ "name": "Spoofing",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "asvspoof"
+ ],
+ "category": "safety_and_security"
+ },
+ "voice_disorder": {
+ "name": "Voice Disorder",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "stuttering_detection"
+ ],
+ "category": "speech_disorder"
+ },
+ "noise_detection": {
+ "name": "Noise Detection",
+ "metrics": [
+ "llm_judge_binary",
+ "detailed_judge_prompt"
+ ],
+ "configs": [
+ "noise_detection"
+ ],
+ "category": "speech_enhancement"
+ },
+ "asr": {
+ "name": "Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "aishell_1_test",
+ "ami_ihm",
+ "ami_sdm",
+ "callhome_asr_deu",
+ "callhome_asr_eng",
+ "callhome_asr_jpn",
+ "callhome_asr_spa",
+ "callhome_asr_zho",
+ "common_voice_15_ab",
+ "common_voice_15_af",
+ "common_voice_15_am",
+ "common_voice_15_ar",
+ "common_voice_15_as",
+ "common_voice_15_ast",
+ "common_voice_15_az",
+ "common_voice_15_ba",
+ "common_voice_15_bas",
+ "common_voice_15_be",
+ "common_voice_15_bg",
+ "common_voice_15_bn",
+ "common_voice_15_br",
+ "common_voice_15_ca",
+ "common_voice_15_ckb",
+ "common_voice_15_cnh",
+ "common_voice_15_cs",
+ "common_voice_15_cv",
+ "common_voice_15_cy",
+ "common_voice_15_da",
+ "common_voice_15_de",
+ "common_voice_15_dv",
+ "common_voice_15_dyu",
+ "common_voice_15_el",
+ "common_voice_15_en",
+ "common_voice_15_eo",
+ "common_voice_15_es",
+ "common_voice_15_et",
+ "common_voice_15_eu",
+ "common_voice_15_fa",
+ "common_voice_15_fi",
+ "common_voice_15_fr",
+ "common_voice_15_fy-NL",
+ "common_voice_15_ga-IE",
+ "common_voice_15_gl",
+ "common_voice_15_ha",
+ "common_voice_15_hi",
+ "common_voice_15_hsb",
+ "common_voice_15_hu",
+ "common_voice_15_hy-AM",
+ "common_voice_15_ia",
+ "common_voice_15_id",
+ "common_voice_15_ig",
+ "common_voice_15_it",
+ "common_voice_15_ja",
+ "common_voice_15_ka",
+ "common_voice_15_kab",
+ "common_voice_15_kk",
+ "common_voice_15_kmr",
+ "common_voice_15_ko",
+ "common_voice_15_ky",
+ "common_voice_15_lg",
+ "common_voice_15_lt",
+ "common_voice_15_lv",
+ "common_voice_15_mdf",
+ "common_voice_15_mg",
+ "common_voice_15_mk",
+ "common_voice_15_ml",
+ "common_voice_15_mn",
+ "common_voice_15_mr",
+ "common_voice_15_mt",
+ "common_voice_15_myv",
+ "common_voice_15_ne-NP",
+ "common_voice_15_nl",
+ "common_voice_15_nn-NO",
+ "common_voice_15_or",
+ "common_voice_15_pa-IN",
+ "common_voice_15_pl",
+ "common_voice_15_pt",
+ "common_voice_15_rm-sursilv",
+ "common_voice_15_rm-vallader",
+ "common_voice_15_ro",
+ "common_voice_15_ru",
+ "common_voice_15_rw",
+ "common_voice_15_sah",
+ "common_voice_15_sat",
+ "common_voice_15_sc",
+ "common_voice_15_sk",
+ "common_voice_15_sl",
+ "common_voice_15_sr",
+ "common_voice_15_sv-SE",
+ "common_voice_15_sw",
+ "common_voice_15_ta",
+ "common_voice_15_te",
+ "common_voice_15_tg",
+ "common_voice_15_th",
+ "common_voice_15_ti",
+ "common_voice_15_tok",
+ "common_voice_15_tr",
+ "common_voice_15_tt",
+ "common_voice_15_ug",
+ "common_voice_15_uk",
+ "common_voice_15_ur",
+ "common_voice_15_uz",
+ "common_voice_15_vot",
+ "common_voice_15_yi",
+ "common_voice_15_yue",
+ "common_voice_15_zh-CN",
+ "common_voice_15_zh-HK",
+ "common_voice_15_zh-TW",
+ "fleurs_af_za",
+ "fleurs_am_et",
+ "fleurs_ar_eg",
+ "fleurs_as_in",
+ "fleurs_ast_es",
+ "fleurs_az_az",
+ "fleurs_be_by",
+ "fleurs_bg_bg",
+ "fleurs_bn_in",
+ "fleurs_bs_ba",
+ "fleurs_ca_es",
+ "fleurs_ceb_ph",
+ "fleurs_ckb_iq",
+ "fleurs_cmn_hans_cn",
+ "fleurs_cs_cz",
+ "fleurs_cy_gb",
+ "fleurs_da_dk",
+ "fleurs_de_de",
+ "fleurs_el_gr",
+ "fleurs_en_us",
+ "fleurs_es_419",
+ "fleurs_et_ee",
+ "fleurs_fa_ir",
+ "fleurs_ff_sn",
+ "fleurs_fi_fi",
+ "fleurs_fil_ph",
+ "fleurs_fr_fr",
+ "fleurs_ga_ie",
+ "fleurs_gl_es",
+ "fleurs_gu_in",
+ "fleurs_ha_ng",
+ "fleurs_he_il",
+ "fleurs_hi_in",
+ "fleurs_hr_hr",
+ "fleurs_hu_hu",
+ "fleurs_hy_am",
+ "fleurs_id_id",
+ "fleurs_ig_ng",
+ "fleurs_is_is",
+ "fleurs_it_it",
+ "fleurs_ja_jp",
+ "fleurs_jv_id",
+ "fleurs_ka_ge",
+ "fleurs_kam_ke",
+ "fleurs_kea_cv",
+ "fleurs_kk_kz",
+ "fleurs_km_kh",
+ "fleurs_kn_in",
+ "fleurs_ko_kr",
+ "fleurs_ky_kg",
+ "fleurs_lb_lu",
+ "fleurs_lg_ug",
+ "fleurs_ln_cd",
+ "fleurs_lo_la",
+ "fleurs_lt_lt",
+ "fleurs_luo_ke",
+ "fleurs_lv_lv",
+ "fleurs_mi_nz",
+ "fleurs_mk_mk",
+ "fleurs_ml_in",
+ "fleurs_mn_mn",
+ "fleurs_mr_in",
+ "fleurs_ms_my",
+ "fleurs_mt_mt",
+ "fleurs_my_mm",
+ "fleurs_nb_no",
+ "fleurs_ne_np",
+ "fleurs_nl_nl",
+ "fleurs_nso_za",
+ "fleurs_ny_mw",
+ "fleurs_oc_fr",
+ "fleurs_om_et",
+ "fleurs_or_in",
+ "fleurs_pa_in",
+ "fleurs_pl_pl",
+ "fleurs_ps_af",
+ "fleurs_pt_br",
+ "fleurs_ro_ro",
+ "fleurs_ru_ru",
+ "fleurs_sd_in",
+ "fleurs_sk_sk",
+ "fleurs_sl_si",
+ "fleurs_sn_zw",
+ "fleurs_so_so",
+ "fleurs_sr_rs",
+ "fleurs_sv_se",
+ "fleurs_sw_ke",
+ "fleurs_ta_in",
+ "fleurs_te_in",
+ "fleurs_tg_tj",
+ "fleurs_th_th",
+ "fleurs_tr_tr",
+ "fleurs_uk_ua",
+ "fleurs_umb_ao",
+ "fleurs_ur_pk",
+ "fleurs_uz_uz",
+ "fleurs_vi_vn",
+ "fleurs_wo_sn",
+ "fleurs_xh_za",
+ "fleurs_yo_ng",
+ "fleurs_yue_hant_hk",
+ "fleurs_zu_za",
+ "gigaspeech_test",
+ "gigaspeech2_id_test",
+ "gigaspeech2_th_test",
+ "gigaspeech2_vi_test",
+ "librispeech_test_clean",
+ "librispeech_test_other",
+ "librispeech_multilingual_dutch",
+ "librispeech_multilingual_french",
+ "librispeech_multilingual_german",
+ "librispeech_multilingual_italian",
+ "librispeech_multilingual_polish",
+ "librispeech_multilingual_portuguese",
+ "librispeech_multilingual_spanish",
+ "mnsc_asr_part1_test",
+ "mnsc_asr_part2_test",
+ "mnsc_asr_part3_test",
+ "mnsc_asr_part4_test",
+ "mnsc_asr_part5_test",
+ "mnsc_asr_part6_test",
+ "peoples_speech_test",
+ "spgispeech_test",
+ "tedlium3_test",
+ "voxpopuli_cs",
+ "voxpopuli_de",
+ "voxpopuli_en",
+ "voxpopuli_en_accented",
+ "voxpopuli_es",
+ "voxpopuli_et",
+ "voxpopuli_fi",
+ "voxpopuli_fr",
+ "voxpopuli_hr",
+ "voxpopuli_hu",
+ "voxpopuli_it",
+ "voxpopuli_lt",
+ "voxpopuli_nl",
+ "voxpopuli_pl",
+ "voxpopuli_ro",
+ "voxpopuli_sk",
+ "voxpopuli_sl"
+ ],
+ "category": "speech_recognition"
+ },
+ "code_switching_asr": {
+ "name": "Code Switching Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "seame_dev_man",
+ "seame_dev_sge"
+ ],
+ "category": "speech_recognition"
+ },
+ "long_form_asr": {
+ "name": "Long Form Asr",
+ "metrics": [
+ "word_error_rate"
+ ],
+ "configs": [
+ "earnings21",
+ "earnings22",
+ "tedlium3_long_form"
+ ],
+ "category": "speech_recognition"
+ },
+ "bfcl": {
+ "name": "Bfcl",
+ "metrics": [
+ "bfcl_match_score"
+ ],
+ "configs": [
+ "bfcl_audio_irrelevance",
+ "bfcl_audio_multiple",
+ "bfcl_audio_parallel",
+ "bfcl_audio_parallel_multiple",
+ "bfcl_audio_simple",
+ "bfcl_audio_irrelevance_no_prompt",
+ "bfcl_audio_multiple_no_prompt",
+ "bfcl_audio_parallel_multiple_no_prompt",
+ "bfcl_audio_parallel_no_prompt",
+ "bfcl_audio_simple_no_prompt",
+ "bfcl_text_irrelevance",
+ "bfcl_text_multiple",
+ "bfcl_text_parallel",
+ "bfcl_text_parallel_multiple",
+ "bfcl_text_simple",
+ "bfcl_text_irrelevance_no_prompt",
+ "bfcl_text_multiple_no_prompt",
+ "bfcl_text_parallel_multiple_no_prompt",
+ "bfcl_text_parallel_no_prompt",
+ "bfcl_text_simple_no_prompt"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "gsm8k": {
+ "name": "Gsm8K",
+ "metrics": [
+ "gsm8k_exact_match"
+ ],
+ "configs": [
+ "gsm8k_audio",
+ "gsm8k_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "ifeval": {
+ "name": "Ifeval",
+ "metrics": [
+ "instruction_following"
+ ],
+ "configs": [
+ "voicebench_ifeval_audio",
+ "voicebench_ifeval_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "mtbench": {
+ "name": "Mtbench",
+ "metrics": [
+ "mt_bench_llm_judge"
+ ],
+ "configs": [
+ "mtbench_audio",
+ "mtbench_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "speech_to_sql": {
+ "name": "Speech To Sql",
+ "metrics": [
+ "sql_score"
+ ],
+ "configs": [
+ "spider_audio",
+ "spider_text"
+ ],
+ "category": "spoken_language_reasoning"
+ },
+ "intent_classification": {
+ "name": "Intent Classification",
+ "metrics": [
+ "llm_judge_binary"
+ ],
+ "configs": [
+ "SLURP-intent"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "speech_qa": {
+ "name": "Speech Qa",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "alpaca_audio_test",
+ "cn_college_listen_mcq_test",
+ "dream_tts_mcq_test",
+ "mnsc_sqa_part3_test",
+ "mnsc_sqa_part4_test",
+ "mnsc_sqa_part5_test",
+ "mnsc_sqa_part6_test",
+ "openhermes_instruction_test",
+ "public_sg_speech_qa_test",
+ "slue_p2_sqa5_test",
+ "spoken_squad_test"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "spoken_dialogue": {
+ "name": "Spoken Dialogue",
+ "metrics": [
+ "joint_goal_accuracy",
+ "slot_accuracy",
+ "slot_f1",
+ "bleu"
+ ],
+ "configs": [
+ "spokenwoz_audio",
+ "spokenwoz_text"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "spoken_dialogue_summarization": {
+ "name": "Spoken Dialogue Summarization",
+ "metrics": [
+ "llm_judge_detailed"
+ ],
+ "configs": [
+ "mnsc_sds_part3_test",
+ "mnsc_sds_part4_test",
+ "mnsc_sds_part5_test"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "sqqa": {
+ "name": "Sqqa",
+ "metrics": [
+ "llm_judge_big_bench_audio"
+ ],
+ "configs": [
+ "big_bench_audio_audio_query",
+ "big_bench_audio_text_query",
+ "mmsu_biology",
+ "mmsu_business",
+ "mmsu_chemistry",
+ "mmsu_economics",
+ "mmsu_engineering",
+ "mmsu_health",
+ "mmsu_history",
+ "mmsu_law",
+ "mmsu_other",
+ "mmsu_philosophy",
+ "mmsu_physics",
+ "mmsu_psychology",
+ "openbookqa",
+ "sd-qa_aus_audio",
+ "sd-qa_aus_text",
+ "sd-qa_gbr_audio",
+ "sd-qa_gbr_text",
+ "sd-qa_ind_n_audio",
+ "sd-qa_ind_n_text",
+ "sd-qa_ind_s_audio",
+ "sd-qa_ind_s_text",
+ "sd-qa_irl_audio",
+ "sd-qa_irl_text",
+ "sd-qa_kenya_audio",
+ "sd-qa_kenya_text",
+ "sd-qa_nga_audio",
+ "sd-qa_nga_text",
+ "sd-qa_nzl_audio",
+ "sd-qa_nzl_text",
+ "sd-qa_phl_audio",
+ "sd-qa_phl_text",
+ "sd-qa_usa_audio",
+ "sd-qa_usa_text",
+ "sd-qa_zaf_audio",
+ "sd-qa_zaf_text"
+ ],
+ "category": "spoken_language_understanding"
+ },
+ "translation": {
+ "name": "Translation",
+ "metrics": [
+ "bleu",
+ "meteor",
+ "bertscore",
+ "comet"
+ ],
+ "configs": [
+ "covost2_ar_en",
+ "covost2_ca_en",
+ "covost2_cy_en",
+ "covost2_de_en",
+ "covost2_en_ar",
+ "covost2_en_ca",
+ "covost2_en_cy",
+ "covost2_en_de",
+ "covost2_en_et",
+ "covost2_en_fa",
+ "covost2_en_id",
+ "covost2_en_ja",
+ "covost2_en_lv",
+ "covost2_en_mn",
+ "covost2_en_sl",
+ "covost2_en_sv-SE",
+ "covost2_en_ta",
+ "covost2_en_tr",
+ "covost2_en_zh-CN",
+ "covost2_es_en",
+ "covost2_et_en",
+ "covost2_fa_en",
+ "covost2_fr_en",
+ "covost2_id_en",
+ "covost2_it_en",
+ "covost2_ja_en",
+ "covost2_lv_en",
+ "covost2_mn_en",
+ "covost2_nl_en",
+ "covost2_pt_en",
+ "covost2_ru_en",
+ "covost2_sl_en",
+ "covost2_sv-SE_en",
+ "covost2_ta_en",
+ "covost2_tr_en",
+ "covost2_zh-CN_en"
+ ],
+ "category": "spoken_language_understanding"
+ }
+}
\ No newline at end of file