diff --git a/README.md b/README.md index fe1d7fd..42f2e41 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,26 @@ bash evaluate.sh Results will be generated in `run_logs/` with detailed metrics and analysis. +## 🖥️ Configuration UI Tool + +AU-Harness includes a web-based configuration UI tool to help users easily create and customize evaluation configurations without manually editing YAML files. + +### Features: +- Interactive task selection from all supported categories (Speech Recognition, Paralinguistics, Audio Understanding, etc.) +- Model configuration with preset templates for common models like GPT-4o-mini and Gemini +- Advanced options for filtering, judge settings, and generation parameters +- Copy to clipboard or download functionality + +### Usage: +1. Navigate to the `ui/` directory +2. Open `index.html` in your web browser +3. Select the tasks you want to evaluate from the categorized task list +4. Configure your models by adding model endpoints, API keys, and parameters +5. Adjust advanced options like sample limits, language filters, and judge settings +6. Generate the YAML configuration, then copy or download it for use with `evaluate.sh` + +This tool simplifies the process of setting up complex evaluation runs by providing a user-friendly interface to build `config.yaml` files, making it easier to get started with AU-Harness evaluations. + ## 💻 Usage AU-Harness requires setting up a running configuration file (`config.yaml`) to define your evaluation parameters. This file controls which models, datasets, and metrics are used in your evaluation. diff --git a/ui/README.md b/ui/README.md new file mode 100644 index 0000000..24c8d32 --- /dev/null +++ b/ui/README.md @@ -0,0 +1,135 @@ +# AU-Harness UI Tool + +A user-friendly web interface for configuring and running audio model evaluations with the AU-Harness framework. + +## 🚀 Quick Start + +1. **Open the UI**: Simply open `index.html` in your web browser +2. 
**Select Tasks**: Browse task categories and select specific tasks with their metrics +3. **Configure Models**: Choose from preset models or configure custom endpoints +4. **Generate Config**: Preview and download the generated YAML configuration + +## 📋 Features + +### Task Selection +- **Visual Category Navigation**: 6 task categories with clear descriptions +- **Smart Metric Filtering**: Automatically shows supported metrics for each task +- **Multi-Selection Support**: Select multiple tasks across different categories +- **Real-time Feedback**: Visual indicators show selected tasks and metrics + +### Model Configuration +- **Preset Models**: Quick setup for common models (GPT-4o, Gemini, Qwen) +- **Custom Model Support**: Add any OpenAI-compatible endpoint +- **Sharding Configuration**: Automatic model instance management +- **Connection Validation**: Built-in endpoint testing + +### Advanced Options +- **Dataset Filtering**: Control sample limits, duration ranges, and language +- **Judge Settings**: Configure LLM judges for evaluation +- **Generation Parameters**: Override model parameters per task +- **Prompt Customization**: Modify system and user prompts + +### Configuration Management +- **YAML Preview**: See generated configuration +- **Export Options**: Download as YAML file or copy to clipboard + +## 🛠️ Technical Details + +### Architecture +- **Frontend**: Vanilla HTML5, CSS3, JavaScript (ES6+) +- **No Dependencies**: Completely self-contained, no npm packages required +- **Responsive Design**: Works on desktop, tablet, and mobile +- **Modern CSS**: CSS Grid, Flexbox, Custom Properties +- **Accessibility**: WCAG 2.1 compliant with semantic HTML + +### File Structure +``` +ui/ +├── index.html # Main application page with HTML comments for sections +├── styles.css # Complete styling with CSS custom properties and section comments +├── app.js # Application logic with detailed function comments +├── generate_tasks.py # Script to generate tasks.js and 
tasks.json with docstrings and comments +├── tasks.js # Task categories and metrics data +├── tasks.json # Task categories and metrics data +└── README.md # This documentation +``` + +### Browser Support +- Chrome 90+ +- Firefox 88+ +- Safari 14+ +- Edge 90+ + +## 📖 Usage Guide + +### 1. Selecting Tasks +1. Click on any category card to expand it +2. Check the boxes next to desired tasks +3. View selected metrics in the "Selected Tasks" section +4. Remove tasks by clicking the "Remove" button + +### 2. Configuring Models +1. Choose "Preset Models" for quick setup +2. Check boxes next to desired models +3. Or switch to "Custom Model" tab for custom endpoints +4. Fill in model name, endpoint, and API key + +### 3. Advanced Configuration +1. Set sample limits to control evaluation size +2. Adjust duration filters for audio length constraints +3. Select target language for evaluation +4. Configure additional options as needed + +### 4. Generating Configuration +1. Click "Generate Config" to create YAML +2. Review the generated configuration in the preview +3. Click "Download YAML" to save the file +4. Use the config with AU-Harness evaluation engine + +### 5. Running Evaluations +1. Click "Run Evaluation" to start the process +2. Monitor progress in the Results Dashboard +3. View scores and metrics as they complete +4. Export results for further analysis + + +## 🚀 Integration with AU-Harness + +The generated YAML configuration is fully compatible with the AU-Harness evaluation engine. Use it as follows: + +```bash +# Using the generated config +python evaluate.py --config your-config.yaml + +# Or with the UI-generated file +python evaluate.py --config au-harness-config.yaml +``` + + +## 🆘 Troubleshooting + +### Common Issues + +**Q: Configuration preview is empty** +A: Make sure you've selected at least one task and one model before generating the config. 
+ +**Q: Download doesn't work** +A: Check your browser's download settings and ensure pop-ups are allowed for this site. + +**Q: Styling looks broken** +A: Open `index.html` from inside the `ui/` directory so the browser can find `styles.css` next to it; if the page is still unstyled, check the browser's Network tab for failed resource loads. + +### Performance Tips + +- For large evaluations, consider reducing sample limits initially +- Use preset models for faster setup +- Clear browser cache if experiencing issues with updates + +## 📞 Support + +For issues with the UI tool, please check: +1. Browser console for JavaScript errors +2. Network tab for any failed resource loads +3. This documentation for usage guidance + +For issues with the AU-Harness framework itself, please refer to the main project documentation. diff --git a/ui/app.js b/ui/app.js new file mode 100644 index 0000000..2abcf93 --- /dev/null +++ b/ui/app.js @@ -0,0 +1,965 @@ +// AU-Harness Configuration UI - Main Application Logic +// This file handles the dynamic loading of tasks, model configuration, +// and YAML generation for the AU-Harness evaluation framework. + +// Task categories and tasks data +let taskCategories = {}; +let taskConfigs = {}; + +// Theme management +function toggleTheme() { + const currentTheme = document.documentElement.getAttribute('data-theme'); + const newTheme = currentTheme === 'dark' ? 
'light' : 'dark'; + + document.documentElement.setAttribute('data-theme', newTheme); + localStorage.setItem('theme', newTheme); + + // Update toggle button + const icon = document.getElementById('theme-icon'); + const text = document.getElementById('theme-text'); + + if (newTheme === 'dark') { + icon.textContent = '☀️'; + text.textContent = 'Light Mode'; + } else { + icon.textContent = '🌙'; + text.textContent = 'Dark Mode'; + } +} + +// Load saved theme on page load +function loadTheme() { + const savedTheme = localStorage.getItem('theme') || 'light'; + document.documentElement.setAttribute('data-theme', savedTheme); + + // Update toggle button + const icon = document.getElementById('theme-icon'); + const text = document.getElementById('theme-text'); + + if (savedTheme === 'dark') { + icon.textContent = '☀️'; + text.textContent = 'Light Mode'; + } else { + icon.textContent = '🌙'; + text.textContent = 'Dark Mode'; + } +} + +function formatDisplayLabel(key) { + return key + .replace(/_/g, ' ') + .replace(/\b\w/g, char => char.toUpperCase()); +} + +function formatConfigLabel(key) { + // Keep config names as-is (don't format them) + return key; +} + +function sanitizeId(value) { + return value.replace(/[^a-zA-Z0-9_-]/g, '-'); +} + +// Load task categories from tasks.js (loaded via script tag) +function loadTaskCategories() { + try { + // Check if TASKS_DATA is available (loaded from tasks.js) + if (typeof window.TASKS_DATA === 'undefined') { + throw new Error('TASKS_DATA not found. 
Please regenerate tasks.js by running generate_tasks.py'); + } + + const data = window.TASKS_DATA; + taskCategories = {}; + taskConfigs = {}; + + Object.entries(data).forEach(([key, value]) => { + if (value && typeof value === 'object' && value.tasks) { + taskCategories[key] = value; + } else if (value && typeof value === 'object' && value.category) { + taskConfigs[key] = value; + } + }); + } catch (error) { + console.error('Failed to load tasks data:', error); + alert('Failed to load tasks metadata. Please regenerate tasks.js via generate_tasks.py.'); + throw error; + } +} + +// Preset models configuration +// These are common model configurations that users can quickly load +const presetModels = { + "gpt-4o-mini": { + name: "gpt-4o-mini-audio-preview", + inference_type: "openai", + url: "${AZURE_ENDPOINT_URL}", + auth_token: "${AZURE_AUTH_TOKEN}", + api_version: "2025-01-01-preview", + delay: 100, + retry_attempts: 10, + timeout: 60, + batch_size: 300, + chunk_size: 30 + }, + "gemini-2.5-flash": { + name: "gemini-2.5-flash", + inference_type: "gemini", + location: "${GOOGLE_CLOUD_LOCATION}", + project_id: "${GOOGLE_CLOUD_PROJECT}", + model: "google/gemini-2.5-flash", + reasoning_effort: "medium", + delay: 150, + retry_attempts: 5, + timeout: 300, + batch_size: 100, + chunk_size: 30240 + }, + "qwen-2.5-omni": { + name: "qwen-2.5-omni", + inference_type: "vllm", + url: "${VLLM_ENDPOINT_URL}", + auth_token: "${VLLM_AUTH_TOKEN}", + delay: 180, + retry_attempts: 8, + timeout: 120, + batch_size: 50, + chunk_size: 40 + } +}; + +// Application state +// Holds the current configuration of selected tasks and models +const state = { + selectedTasks: [], + models: [], + advancedOptions: { + sample_limit: 500, + min_duration: 1.0, + max_duration: 60.0, + language: "en", + accented: false, + metric_aggregation: "average", + judge_api_version: "", + judge_prompt_model_override: "", + judge_model: "gpt-4o-mini", + judge_type: "openai", + judge_api_endpoint: "${ENDPOINT_URL}", + 
judge_api_key: "${AUTH_TOKEN}", + judge_concurrency: 16, + judge_temperature: 0.0, + generation_params_override: "", + prompt_overrides: "" + } +}; + +let modelCount = 1; + +function addSelectedTask(entry) { + const exists = state.selectedTasks.some( + task => + task.identifier === entry.identifier && + task.metric === entry.metric && + task.category === entry.category + ); + if (!exists) { + state.selectedTasks.push(entry); + } +} + +function removeSelectedTask(predicate) { + state.selectedTasks = state.selectedTasks.filter(task => !predicate(task)); +} + +function updateCategoryCardState(categoryCard) { + if (!categoryCard) return; + const hasCheckedTask = categoryCard.querySelector('.task-checkbox:checked'); + const hasCheckedConfig = categoryCard.querySelector('.task-config-checkbox:checked'); + if (hasCheckedTask || hasCheckedConfig) { + categoryCard.classList.add('selected'); + } else { + categoryCard.classList.remove('selected'); + } +} + +// Initialize the application +// Called when the DOM is fully loaded +document.addEventListener('DOMContentLoaded', function() { + loadTheme(); + loadTaskCategories(); + initializeTaskCategories(); + initializeTaskSelectionControls(); + initializeModelConfiguration(); + initializeAdvancedOptions(); + initializePreviewActions(); +}); + +// Initialize model configuration +// Sets up event listeners for adding models and loading examples +function initializeModelConfiguration() { + // Add model button + document.getElementById('add-model-btn').addEventListener('click', addNewModel); + + // Load example button + document.getElementById('load-example-btn').addEventListener('click', loadExampleConfig); +} + +// Add new model configuration +// Creates a new model form section dynamically +function addNewModel() { + const container = document.getElementById('models-container'); + const modelIndex = modelCount++; + + const modelDiv = document.createElement('div'); + modelDiv.className = 'model-config-item'; + 
modelDiv.dataset.modelIndex = modelIndex; + + modelDiv.innerHTML = ` +

Model ${modelIndex + 1}

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ Advanced Config +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + + `; + + container.appendChild(modelDiv); + updateRemoveButtons(); +} + +// Remove model configuration +function removeModel(modelIndex) { + const modelDiv = document.querySelector(`[data-model-index="${modelIndex}"]`); + if (modelDiv) { + modelDiv.remove(); + updateRemoveButtons(); + } +} + +// Update remove buttons visibility +// Hides remove button when only one model remains +function updateRemoveButtons() { + const models = document.querySelectorAll('.model-config-item'); + document.querySelectorAll('.remove-model-btn').forEach(btn => { + btn.style.display = models.length > 1 ? 'block' : 'none'; + }); +} + +// Load example configuration +// Fills the first model with a sample configuration +function loadExampleConfig() { + const exampleConfig = { + name: "gpt-4o-mini-audio-preview", + displayName: "gpt-4o-mini-audio-preview-1", + inferenceType: "openai", + endpoint: "https://your-endpoint.openai.azure.com", + apiKey: "your-api-key-here", + authToken: "${AZURE_AUTH_TOKEN}", + apiVersion: "2025-01-01-preview", + location: "", + projectId: "", + delay: 100, + retry: 10, + timeout: 60, + batchSize: 300, + chunkSize: 30, + reasoningEffort: "" + }; + + // Fill the first model with example data + const firstModel = document.querySelector('.model-config-item'); + if (firstModel) { + firstModel.querySelector('.model-name').value = exampleConfig.name; + firstModel.querySelector('.model-display-name').value = exampleConfig.displayName; + firstModel.querySelector('.model-inference-type').value = exampleConfig.inferenceType; + firstModel.querySelector('.model-endpoint').value = exampleConfig.endpoint; + firstModel.querySelector('.model-api-key').value = exampleConfig.apiKey; + firstModel.querySelector('.model-auth-token').value = exampleConfig.authToken; + firstModel.querySelector('.model-api-version').value = exampleConfig.apiVersion; + firstModel.querySelector('.model-location').value = exampleConfig.location; + firstModel.querySelector('.model-project-id').value = 
exampleConfig.projectId; + firstModel.querySelector('.model-delay').value = exampleConfig.delay; + firstModel.querySelector('.model-retry').value = exampleConfig.retry; + firstModel.querySelector('.model-timeout').value = exampleConfig.timeout; + firstModel.querySelector('.model-batch-size').value = exampleConfig.batchSize; + firstModel.querySelector('.model-chunk-size').value = exampleConfig.chunkSize; + firstModel.querySelector('.model-reasoning-effort').value = exampleConfig.reasoningEffort; + } +} + +// Collect model configurations +// Gathers all model data from the form into an array. +// Forms missing a required field (name, display name, inference type, endpoint, API key) are skipped. +function collectModelConfigurations() { + // Read an integer field from the given model form. + // Falls back to `fallback` when the input is blank or not a number; a typed 0 is kept only + // when `allowZero` is true, so "no delay" / "no retries" are not overwritten by the default. + const readInt = (modelDiv, selector, fallback, allowZero) => { + const parsed = parseInt(modelDiv.querySelector(selector).value, 10); + if (Number.isNaN(parsed) || (parsed === 0 && !allowZero)) return fallback; + return parsed; + }; + + const models = []; + document.querySelectorAll('.model-config-item').forEach(modelDiv => { + const model = { + name: modelDiv.querySelector('.model-name').value.trim(), + displayName: modelDiv.querySelector('.model-display-name').value.trim(), + inferenceType: modelDiv.querySelector('.model-inference-type').value, + endpoint: modelDiv.querySelector('.model-endpoint').value.trim(), + apiKey: modelDiv.querySelector('.model-api-key').value.trim(), + authToken: modelDiv.querySelector('.model-auth-token').value.trim() || undefined, + apiVersion: modelDiv.querySelector('.model-api-version').value.trim() || undefined, + location: modelDiv.querySelector('.model-location').value.trim() || undefined, + projectId: modelDiv.querySelector('.model-project-id').value.trim() || undefined, + delay: readInt(modelDiv, '.model-delay', 100, true), + retry: readInt(modelDiv, '.model-retry', 8, true), + timeout: readInt(modelDiv, '.model-timeout', 30, false), + batchSize: readInt(modelDiv, '.model-batch-size', 1, false), + chunkSize: readInt(modelDiv, '.model-chunk-size', 30, false), + reasoningEffort: modelDiv.querySelector('.model-reasoning-effort').value || undefined + }; + + // Only add if required fields are filled + if (model.name && model.displayName && model.inferenceType 
&& model.endpoint && model.apiKey) { + models.push(model); + } + }); + + return models; +} + +// Initialize task selection UI +// Populates the task categories and their tasks +function initializeTaskCategories() { + const container = document.getElementById('task-categories'); + container.innerHTML = ''; + + Object.entries(taskCategories).forEach(([categoryKey, category]) => { + const categoryCard = document.createElement('div'); + categoryCard.className = 'category-card expanded'; + categoryCard.dataset.category = categoryKey; + + const tasksMarkup = Object.entries(category.tasks).map(([taskKey, task]) => { + const configsForTask = task.configs || []; + const hasConfigs = configsForTask.length > 0; + + if (hasConfigs) { + // Smart expand: auto-expand only if exactly 1 config + const shouldAutoExpand = configsForTask.length === 1; + const expandedClass = shouldAutoExpand ? 'expanded' : ''; + + // Create config options with metrics displayed next to each config + const configOptions = configsForTask.map(configKey => { + const configId = `config-${sanitizeId(`${categoryKey}-${taskKey}-${configKey}`)}`; + const metricsMarkup = task.metrics && task.metrics.length > 0 + ? `${task.metrics.join(', ')}` + : ''; + return ` +
+ + +
+ `; + }).join(''); + + // Show bulk action buttons only if more than 1 config + const bulkActionsMarkup = configsForTask.length > 1 ? ` +
+ + +
+ ` : ''; + + return ` +
+
+
+ ${task.name} + ${configsForTask.length} config${configsForTask.length !== 1 ? 's' : ''} +
+ ${shouldAutoExpand ? '▼' : '▶'} +
+
+ ${bulkActionsMarkup} +
+ ${configOptions} +
+
+
+ `; + } + + const checkboxId = `task-${sanitizeId(`${categoryKey}-${taskKey}`)}`; + return ` +
+ + +
+ `; + }).join(''); + + categoryCard.innerHTML = ` +

${category.name}

+

${category.description}

+
+ ${tasksMarkup} +
+ `; + + container.appendChild(categoryCard); + }); + + document.querySelectorAll('.task-checkbox').forEach(checkbox => { + checkbox.addEventListener('change', handleTaskSelection); + }); + document.querySelectorAll('.task-config-checkbox').forEach(checkbox => { + checkbox.addEventListener('change', handleConfigSelection); + }); +} + +// Handle task selection +// Updates the state when a task checkbox is toggled +function handleTaskSelection(event) { + const checkbox = event.target; + const categoryKey = checkbox.dataset.category; + const taskKey = checkbox.dataset.task; + const category = taskCategories[categoryKey]; + const task = category?.tasks?.[taskKey]; + + if (!task) return; + + if (checkbox.checked) { + task.metrics.forEach(metric => { + addSelectedTask({ + category: categoryKey, + task: taskKey, + taskName: task.name, + config: null, + configName: null, + metric, + identifier: taskKey + }); + }); + } else { + removeSelectedTask( + t => t.category === categoryKey && t.task === taskKey && t.identifier === taskKey + ); + } + + updateCategoryCardState(checkbox.closest('.category-card')); +} + +function handleConfigSelection(event) { + const checkbox = event.target; + const { category: categoryKey, task: taskKey, config: configKey } = checkbox.dataset; + const category = taskCategories[categoryKey]; + const task = category?.tasks?.[taskKey]; + + if (!task) return; + + if (checkbox.checked) { + task.metrics.forEach(metric => { + addSelectedTask({ + category: categoryKey, + task: taskKey, + taskName: task.name, + config: configKey, + configName: formatConfigLabel(configKey), + metric, + identifier: configKey + }); + }); + } else { + removeSelectedTask( + t => t.category === categoryKey && t.task === taskKey && t.identifier === configKey + ); + } + + updateCategoryCardState(checkbox.closest('.category-card')); +} + +// Initialize advanced options +// Binds form inputs to state variables +function initializeAdvancedOptions() { + const inputs = { + 'sample-limit': 
'sample_limit', + 'min-duration': 'min_duration', + 'max-duration': 'max_duration', + 'language': 'language', + 'accented': 'accented', + 'metric-aggregation': 'metric_aggregation', + 'judge-api-version': 'judge_api_version', + 'judge-prompt-model-override': 'judge_prompt_model_override', + 'judge-model': 'judge_model', + 'judge-type': 'judge_type', + 'judge-api-endpoint': 'judge_api_endpoint', + 'judge-api-key': 'judge_api_key', + 'judge-concurrency': 'judge_concurrency', + 'judge-temperature': 'judge_temperature', + 'generation-params-override': 'generation_params_override', + 'prompt-overrides': 'prompt_overrides' + }; + + Object.entries(inputs).forEach(([id, stateKey]) => { + const element = document.getElementById(id); + if (element) { + element.addEventListener('change', function() { + if (element.type === 'checkbox') { + state.advancedOptions[stateKey] = element.checked; + } else { + state.advancedOptions[stateKey] = element.value; + } + }); + // Set initial value + if (element.type === 'checkbox') { + element.checked = state.advancedOptions[stateKey]; + } else { + element.value = state.advancedOptions[stateKey]; + } + } + }); +} + +// Initialize preview actions +// Sets up event listeners for config generation, copy, and download +function initializePreviewActions() { + document.getElementById('generate-config').addEventListener('click', generateConfig); + document.getElementById('copy-config').addEventListener('click', copyConfig); + document.getElementById('download-config').addEventListener('click', downloadConfig); +} + +// Generate configuration +// Creates the YAML config from current state +function generateConfig() { + const models = collectModelConfigurations(); + + if (state.selectedTasks.length === 0) { + alert('Please select at least one task'); + return; + } + + if (models.length === 0) { + alert('Please configure at least one model'); + return; + } + + // Generate timestamp in YYYYMMDD_HHMMSS format + const now = new Date(); + const timestamp 
= now.getFullYear() + + String(now.getMonth() + 1).padStart(2, '0') + + String(now.getDate()).padStart(2, '0') + '_' + + String(now.getHours()).padStart(2, '0') + + String(now.getMinutes()).padStart(2, '0') + + String(now.getSeconds()).padStart(2, '0'); + + // Generate aggregate from selected tasks/configs + const metricGroups = {}; + state.selectedTasks.forEach(task => { + const targetKey = task.identifier || task.task; + if (!metricGroups[task.metric]) metricGroups[task.metric] = []; + if (!metricGroups[task.metric].includes(targetKey)) { + metricGroups[task.metric].push(targetKey); + } + }); + const aggregate = Object.entries(metricGroups).filter(([metric, tasks]) => tasks.length > 1).map(([metric, tasks]) => [metric, tasks]); + + const config = { + task_metric: state.selectedTasks.map(task => [ + task.identifier || task.task, + task.metric + ]) + }; + + // Add aggregate if there are grouped metrics + if (aggregate.length > 0) { + config.aggregate = aggregate; + } + + config.filter = { + num_samples: state.advancedOptions.sample_limit, + length_filter: [state.advancedOptions.min_duration, state.advancedOptions.max_duration], + language: state.advancedOptions.language, + accented: state.advancedOptions.accented + }; + + config.judge_settings = { + judge_model: state.advancedOptions.judge_model, + judge_type: state.advancedOptions.judge_type, + judge_api_endpoint: state.advancedOptions.judge_api_endpoint, + judge_api_key: state.advancedOptions.judge_api_key, + judge_concurrency: state.advancedOptions.judge_concurrency, + judge_temperature: state.advancedOptions.judge_temperature, + ...(state.advancedOptions.judge_api_version && { judge_api_version: state.advancedOptions.judge_api_version }), + ...(state.advancedOptions.judge_prompt_model_override && { judge_prompt_model_override: state.advancedOptions.judge_prompt_model_override }) + }; + + config.logging = { + log_file: `run_${timestamp}.log` + }; + + // Add generation params override if provided + if 
(state.advancedOptions.generation_params_override.trim()) { + config.generation_params_override = state.advancedOptions.generation_params_override.trim(); + } + + // Add prompt overrides if provided + if (state.advancedOptions.prompt_overrides.trim()) { + config.prompt_overrides = state.advancedOptions.prompt_overrides.trim(); + } + + config.models = models.map((model, index) => { + const modelConfig = { + name: model.displayName, + inference_type: model.inferenceType, + url: model.endpoint, + model: model.name, + auth_token: model.authToken || model.apiKey, + delay: model.delay, + retry_attempts: model.retry, + timeout: model.timeout, + batch_size: model.batchSize, + chunk_size: model.chunkSize + }; + + // Add optional fields only if they have values + if (model.apiVersion) modelConfig.api_version = model.apiVersion; + if (model.location) modelConfig.location = model.location; + if (model.projectId) modelConfig.project_id = model.projectId; + if (model.reasoningEffort) modelConfig.reasoning_effort = model.reasoningEffort; + + return modelConfig; + }); + + // Add metric_aggregation + if (state.advancedOptions.metric_aggregation !== "average") { + config.metric_aggregation = state.advancedOptions.metric_aggregation; + } + + const yaml = generateYAML(config); + document.getElementById('config-preview').textContent = yaml; +} + +// Generate YAML from config object +// Recursively generates YAML string from a given object +function generateYAML(obj, indent = 0) { + const spaces = ' '.repeat(indent); + let yaml = ''; + + if (Array.isArray(obj)) { + obj.forEach(item => { + if (typeof item === 'object') { + yaml += `${spaces}-\n`; + yaml += generateYAML(item, indent + 1); + } else { + yaml += `${spaces}- ${item}\n`; + } + }); + } else if (typeof obj === 'object' && obj !== null) { + Object.entries(obj).forEach(([key, value]) => { + if (typeof value === 'object' && value !== null && !Array.isArray(value)) { + yaml += `${spaces}${key}:\n`; + yaml += generateYAML(value, 
indent + 1); + } else if (Array.isArray(value)) { + yaml += `${spaces}${key}:\n`; + yaml += generateYAML(value, indent + 1); + } else { + yaml += `${spaces}${key}: ${value}\n`; + } + }); + } + + return yaml; +} + +// Download configuration +// Saves the generated YAML config to a file +function downloadConfig() { + const config = document.getElementById('config-preview').textContent; + + // Generate timestamp in YYYYMMDD_HHMMSS format + const now = new Date(); + const timestamp = now.getFullYear() + + String(now.getMonth() + 1).padStart(2, '0') + + String(now.getDate()).padStart(2, '0') + '_' + + String(now.getHours()).padStart(2, '0') + + String(now.getMinutes()).padStart(2, '0') + + String(now.getSeconds()).padStart(2, '0'); + + const filename = `au-harness-config-${timestamp}.yaml`; + + const blob = new Blob([config], { type: 'text/yaml' }); + const url = URL.createObjectURL(blob); + + const a = document.createElement('a'); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); +} + +// Copy configuration to clipboard +// Copies the generated YAML config to the clipboard and shows a success or +// failure label on the copy button for two seconds before restoring it. +async function copyConfig() { + const config = document.getElementById('config-preview').textContent; + const button = document.getElementById('copy-config'); + // Remember the idle label once: a second click while the feedback label is still showing + // would otherwise capture the feedback text as the label to restore. + if (button.dataset.idleLabel === undefined) { + button.dataset.idleLabel = button.textContent; + } + const originalText = button.dataset.idleLabel; + + try { + if (navigator.clipboard && window.isSecureContext) { + // Use the Clipboard API when available + await navigator.clipboard.writeText(config); + } else { + // Fallback for older browsers or non-HTTPS contexts + const textArea = document.createElement('textarea'); + textArea.value = config; + textArea.style.position = 'fixed'; + textArea.style.left = '-999999px'; + textArea.style.top = '-999999px'; + document.body.appendChild(textArea); + textArea.focus(); + textArea.select(); + + try { + // execCommand reports failure by returning false rather than throwing, + // so turn that into an error for the catch block of this function. + if (!document.execCommand('copy')) { + throw new Error('document.execCommand("copy") was rejected'); + } + } finally { + document.body.removeChild(textArea); + } + } + + // Visual 
// Initialize task selection controls
// Sets up event listeners for select all and reset buttons
function initializeTaskSelectionControls() {
    document.getElementById('select-all-tasks').addEventListener('click', selectAllTasks);
    document.getElementById('reset-selection').addEventListener('click', resetSelection);
}

// Set the checked state of a group of checkboxes
// Only boxes whose state actually differs are flipped, and each flipped box
// gets a 'change' event dispatched on it so its change listeners are notified
function setCheckedState(checkboxes, checked) {
    checkboxes.forEach(checkbox => {
        if (checkbox.checked !== checked) {
            checkbox.checked = checked;
            checkbox.dispatchEvent(new Event('change'));
        }
    });
}

// Find the config checkboxes of one task
// Matches on the data-category / data-task attributes of .task-config-checkbox
function findTaskConfigCheckboxes(categoryKey, taskKey) {
    return document.querySelectorAll(
        `.task-config-checkbox[data-category="${categoryKey}"][data-task="${taskKey}"]`
    );
}

// Expand or collapse a task item
// Keeps the 'expanded' class and the arrow icon (when the item has one) in sync
function setTaskExpanded(taskItem, expanded) {
    taskItem.classList.toggle('expanded', expanded);
    const icon = taskItem.querySelector('.expand-icon');
    if (icon) icon.textContent = expanded ? '▼' : '▶';
}

// Select all tasks
// Checks every task checkbox first, then every task-config checkbox
function selectAllTasks() {
    setCheckedState(document.querySelectorAll('.task-checkbox'), true);
    setCheckedState(document.querySelectorAll('.task-config-checkbox'), true);
}

// Reset selection
// Unchecks every task checkbox first, then every task-config checkbox
function resetSelection() {
    setCheckedState(document.querySelectorAll('.task-checkbox'), false);
    setCheckedState(document.querySelectorAll('.task-config-checkbox'), false);
}

// Toggle task configs visibility
// Flips the 'expanded' class on the .task-item enclosing the clicked header and
// updates the header's arrow icon; does nothing if there is no enclosing .task-item
function toggleTaskConfigs(headerElement) {
    const taskItem = headerElement.closest('.task-item');
    if (!taskItem) return;

    // classList.toggle returns the resulting state, so no second lookup is needed
    const expanded = taskItem.classList.toggle('expanded');
    const icon = headerElement.querySelector('.expand-icon');
    if (icon) icon.textContent = expanded ? '▼' : '▶';
}

// Select all configs for a specific task
// Stops the click from propagating to ancestor handlers before checking the boxes
function selectAllConfigsForTask(categoryKey, taskKey, event) {
    if (event) event.stopPropagation();
    setCheckedState(findTaskConfigCheckboxes(categoryKey, taskKey), true);
}

// Deselect all configs for a specific task
// Stops the click from propagating to ancestor handlers before unchecking the boxes
function deselectAllConfigsForTask(categoryKey, taskKey, event) {
    if (event) event.stopPropagation();
    setCheckedState(findTaskConfigCheckboxes(categoryKey, taskKey), false);
}

// Filter configs by pattern
// Shows only the config items whose label contains the pattern (case-insensitive),
// expands every task that still has a visible config, and alerts when nothing matches
function filterByPattern(pattern) {
    // Lower-case the pattern once instead of once per config item
    const needle = String(pattern).toLowerCase();
    let matchCount = 0;

    document.querySelectorAll('.config-item').forEach(item => {
        const label = item.querySelector('label');
        const matches = Boolean(label) && label.textContent.toLowerCase().includes(needle);
        item.style.display = matches ? 'flex' : 'none';
        if (matches) matchCount++;
    });

    // Auto-expand tasks that have matching configs. Visibility is read from the
    // display value set above rather than from a substring match on the style
    // attribute, which would also match unrelated properties such as flex-wrap.
    document.querySelectorAll('.task-item.has-configs').forEach(taskItem => {
        const hasVisibleConfig = Array.from(taskItem.querySelectorAll('.config-item'))
            .some(item => item.style.display !== 'none');
        if (hasVisibleConfig) setTaskExpanded(taskItem, true);
    });

    // Show feedback
    if (matchCount === 0) {
        alert(`No configs found matching "${pattern}"`);
    }
}

// Clear filter
// Makes every config item visible again and restores the default
// expand/collapse state (only tasks with exactly one config are expanded)
function clearFilter() {
    document.querySelectorAll('.config-item').forEach(item => {
        item.style.display = 'flex';
    });

    document.querySelectorAll('.task-item.has-configs').forEach(taskItem => {
        const configCount = taskItem.querySelectorAll('.config-item').length;
        setTaskExpanded(taskItem, configCount === 1);
    });
}
def get_category_description(category):
    """Return a human-friendly description for a task category.

    Unknown categories get a generic sentence built from the category key
    with underscores replaced by spaces.
    """
    descriptions = {
        "speech_recognition": "Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.",
        "paralinguistics": "Tasks that analyze non-verbal aspects of speech such as emotion, gender, accent, and speaker characteristics.",
        "audio_understanding": "Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.",
        "spoken_language_understanding": "Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.",
        "spoken_language_reasoning": "Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.",
        "safety_and_security": "Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.",
        "speech_enhancement": "Tasks related to speech quality improvement, noise detection, and audio enhancement.",
        "speech_disorder": "Tasks related to detecting and analyzing speech disorders and voice pathologies.",
        "phonetics": "Tasks related to phonetic analysis, phoneme recognition, and speech sound processing."
    }

    return descriptions.get(category, f"Tasks related to {category.replace('_', ' ')}.")


def get_category_display_name(category):
    """Return the display name (with emoji) for a category.

    Unknown categories fall back to the key with underscores replaced by
    spaces and each word title-cased.
    """
    display_names = {
        "speech_recognition": "🗣️ Speech Recognition",
        "paralinguistics": "🎭 Paralinguistics",
        "audio_understanding": "🔊 Audio Understanding",
        "spoken_language_understanding": "🧠 Spoken Language Understanding",
        "spoken_language_reasoning": "🧩 Spoken Language Reasoning",
        "safety_and_security": "🔐 Safety and Security",
        "speech_enhancement": "✨ Speech Enhancement",
        "speech_disorder": "🩺 Speech Disorder",
        "phonetics": "📢 Phonetics"
    }

    return display_names.get(category, category.replace('_', ' ').title())


def format_task_name(task_name):
    """Format a task directory name for display (underscores to spaces, title case)."""
    return task_name.replace('_', ' ').title()


def safe_load_yaml(yaml_path: Path) -> Dict:
    """Load a YAML file and always return a dict.

    Returns an empty dict (after printing a warning where something went wrong)
    when the file cannot be read or decoded, cannot be parsed, is empty, or when
    its top-level value is not a mapping. Callers can therefore use ``.get``
    on the result unconditionally.
    """
    try:
        with open(yaml_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        print(f"Warning: Failed to parse YAML file {yaml_path}: {exc}")
        return {}
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly for files that are not valid UTF-8.
        print(f"Warning: Failed to read YAML file {yaml_path}: {exc}")
        return {}

    if not data:
        # Empty document (None) or any other falsy top-level value.
        return {}
    if not isinstance(data, dict):
        # A top-level list/scalar would otherwise break callers using data.get().
        print(f"Warning: Ignoring YAML file {yaml_path}: top-level value is not a mapping")
        return {}
    return data


def extract_metrics_from_yaml(data: Dict) -> List[str]:
    """Extract the metric names listed under the ``metrics`` key of a YAML object.

    Each entry of the ``metrics`` list may be either a mapping with a ``metric``
    key or a plain string. Entries of any other shape, empty names and non-string
    names are skipped. The result is de-duplicated with first-seen order kept.
    Returns an empty list when ``data`` is not a dict or has no ``metrics`` list.
    """
    yaml_metrics = data.get("metrics") if isinstance(data, dict) else None
    if not isinstance(yaml_metrics, list):
        return []

    names = []
    for item in yaml_metrics:
        name = item.get("metric") if isinstance(item, dict) else item
        # Only non-empty strings are kept; this also keeps unhashable values
        # (e.g. a list under "metric") away from the de-duplication step.
        if isinstance(name, str) and name:
            names.append(name)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(names))


def collect_metrics_from_task_dir(task_dir: Path) -> List[str]:
    """
    Collect metrics for a task directory by inspecting its YAML files.
    Preference is given to base.yaml directly under the directory if present;
    otherwise all ``*.yaml`` files under the directory (recursively, in sorted
    order) are scanned and the metrics of the first file that defines any are
    returned. Returns an empty list when no file defines metrics.
    """
    base_yaml = task_dir / "base.yaml"
    candidates = [base_yaml] if base_yaml.exists() else []
    candidates.extend(
        yaml_path for yaml_path in sorted(task_dir.rglob("*.yaml")) if yaml_path != base_yaml
    )

    for yaml_file in candidates:
        metrics = extract_metrics_from_yaml(safe_load_yaml(yaml_file))
        if metrics:
            return metrics

    return []


def collect_configs_from_task_dir(task_dir: Path) -> List[str]:
    """Collect config identifiers from YAML files excluding base definitions.

    Every ``*.yaml`` file under ``task_dir`` (recursively, in sorted order) whose
    file name is not ``base.yaml`` contributes one identifier: its ``task_name``
    value when present and truthy, otherwise the file name without extension.
    Duplicates are dropped, keeping the first occurrence.
    """
    configs = []

    for yaml_path in sorted(task_dir.rglob("*.yaml")):
        # Skip base.yaml files at any level
        if yaml_path.name.lower() == "base.yaml":
            continue

        # safe_load_yaml always returns a dict, so .get is safe here.
        config_name = safe_load_yaml(yaml_path).get("task_name") or yaml_path.stem
        if config_name and config_name not in configs:
            configs.append(config_name)

    return configs


def load_task_categories_from_yaml(tasks_dir: Path):
    """
    Build the task categories dictionary by traversing the tasks directory.
    Each top-level directory is treated as a category and each immediate
    sub-directory is treated as a task whose metrics/configs are discovered from YAML.
    Tasks without any metrics and categories without any tasks are omitted.

    Returns a tuple ``(task_categories, task_details)``: the categories dictionary
    and a flat task metadata dictionary keyed by task directory name. If the same
    task directory name occurs in more than one category, a warning is printed and
    the later category's entry replaces the earlier one in ``task_details``.
    """
    task_categories = {}
    task_details = {}

    for category_path in sorted(tasks_dir.iterdir(), key=lambda p: p.name):
        if not category_path.is_dir():
            continue

        category_key = category_path.name
        tasks = {}
        for task_path in sorted(category_path.iterdir(), key=lambda p: p.name):
            if not task_path.is_dir():
                continue

            metrics = collect_metrics_from_task_dir(task_path)
            if not metrics:
                continue

            task_key = task_path.name
            task_info = {
                "name": format_task_name(task_key),
                "metrics": metrics
            }

            # Add configs to task_info only if any were found
            configs = collect_configs_from_task_dir(task_path)
            if configs:
                task_info["configs"] = configs

            tasks[task_key] = task_info

            if task_key in task_details:
                print(
                    f"Warning: Task '{task_key}' exists in both "
                    f"'{task_details[task_key]['category']}' and '{category_key}'; "
                    f"the flat task entry now refers to '{category_key}'."
                )

            # The flat entry is a shallow copy of task_info plus its category.
            task_details[task_key] = {**task_info, "category": category_key}

        if not tasks:
            continue

        task_categories[category_key] = {
            "name": get_category_display_name(category_key),
            "description": get_category_description(category_key),
            "tasks": tasks
        }

    return task_categories, task_details


def main():
    """Generate tasks.json and tasks.js next to this script from the tasks folder.

    The tasks folder is expected at ``<script directory>/../tasks``. Both output
    files contain the same JSON object: the category entries merged with the flat
    per-task entries.

    Returns 0 on success and 1 when the tasks directory is missing or no task
    categories were discovered, so the process exit status reflects failure.
    """
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    tasks_dir = project_root / "tasks"

    if not tasks_dir.is_dir():
        print(f"Error: Tasks directory not found at {tasks_dir}")
        return 1

    task_categories, task_details = load_task_categories_from_yaml(tasks_dir)

    if not task_categories:
        print("No task categories were discovered from YAML files.")
        return 1

    # Categories and tasks share one key namespace in the output; on a clash the
    # task entry replaces the category entry, so make that visible.
    clashes = sorted(set(task_categories) & set(task_details))
    if clashes:
        print(f"Warning: Task keys shadow category keys in the output: {', '.join(clashes)}")

    combined_output = {**task_categories, **task_details}
    # Serialize once; both files embed exactly the same JSON text.
    payload = json.dumps(combined_output, indent=2, ensure_ascii=False)

    # Generate tasks.json
    json_output_file = script_dir / "tasks.json"
    with open(json_output_file, 'w', encoding='utf-8') as f:
        f.write(payload)

    # Generate tasks.js for local file access (no server needed)
    js_output_file = script_dir / "tasks.js"
    with open(js_output_file, 'w', encoding='utf-8') as f:
        f.write("// Auto-generated by generate_tasks.py\n")
        f.write("// This file allows the UI to work when opening index.html directly in a browser\n")
        f.write("window.TASKS_DATA = ")
        f.write(payload)
        f.write(";\n")

    print(f"Successfully generated {json_output_file} and {js_output_file} with {len(task_categories)} task categories")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+
+ +

🎯 AU-Harness Configuration

+

Configure and run audio model evaluations

+
+ +
+ +
+

📊 Task Selection

+

Select tasks for evaluation

+ +
+ + +
+ + +
+

Quick Filters:

+
+ + + + + + + + +
+
+ +
+ +
+
+ + +
+

🤖 Model Configuration

+

Configure one or more models for evaluation. Add multiple models to compare performance.

+ +
+
+

Model 1

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ Advanced Config +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+
+ +
+ + +
+
+ + +
+

⚖️ Judge Configuration

+

Configure the judge model for LLM-based evaluation metrics. Required when selecting tasks with judge metrics.

+ +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+

⚙️ Advanced Configuration

+
+ Evaluation Settings +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+
+
+ Prompt Updates +
+ + +
+
+ + +
+
+
+ + +
+

📋 Configuration Preview

+
+ + + +
+
# Configuration will appear here...
+
+ +
+
+ + + + + diff --git a/ui/styles.css b/ui/styles.css new file mode 100644 index 0000000..c83ead1 --- /dev/null +++ b/ui/styles.css @@ -0,0 +1,919 @@ +/* CSS Reset */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +/* CSS Variables */ +:root { + --primary-color: #2563eb; + --secondary-color: #64748b; + --success-color: #16a34a; + --warning-color: #d97706; + --error-color: #dc2626; + --background: #f8fafc; + --card-bg: #ffffff; + --text-primary: #1e293b; + --text-secondary: #64748b; + --border-color: #e2e8f0; + --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1); + --shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.1); + --header-gradient-start: #2563eb; + --header-gradient-end: #3b82f6; + --task-header-bg: #fafbfc; + --task-header-hover: #f1f5f9; + --filter-bg: #f8fafc; +} + +/* Dark Mode Variables */ +[data-theme="dark"] { + --primary-color: #3b82f6; + --secondary-color: #94a3b8; + --success-color: #22c55e; + --warning-color: #f59e0b; + --error-color: #ef4444; + --background: #0f172a; + --card-bg: #1e293b; + --text-primary: #f1f5f9; + --text-secondary: #94a3b8; + --border-color: #334155; + --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.5); + --shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.5); + --header-gradient-start: #1e3a8a; + --header-gradient-end: #1e40af; + --task-header-bg: #334155; + --task-header-hover: #475569; + --filter-bg: #334155; +} + +/* Base Styles */ +body { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + background: var(--background); + color: var(--text-primary); + line-height: 1.6; +} + +/* Layout */ +.container { + max-width: 1200px; + margin: 0 auto; + padding: 20px; +} + +header { + text-align: center; + margin-bottom: 40px; + padding: 40px 0; + background: linear-gradient(135deg, var(--header-gradient-start), var(--header-gradient-end)); + color: white; + border-radius: 12px; + position: relative; +} + +header h1 { + font-size: 2.5rem; + margin-bottom: 10px; +} + +header p { + font-size: 1.1rem; + opacity: 0.9; +} + +/* 
Dark Mode Toggle */ +.theme-toggle { + position: absolute; + top: 20px; + right: 20px; + background: rgba(255, 255, 255, 0.2); + border: 1px solid rgba(255, 255, 255, 0.3); + color: white; + padding: 8px 16px; + border-radius: 20px; + cursor: pointer; + font-size: 0.9rem; + transition: all 0.3s; + display: flex; + align-items: center; + gap: 6px; +} + +.theme-toggle:hover { + background: rgba(255, 255, 255, 0.3); + transform: scale(1.05); +} + +/* Cards */ +.card { + background: var(--card-bg); + border-radius: 12px; + padding: 24px; + margin-bottom: 24px; + box-shadow: var(--shadow); + border: 1px solid var(--border-color); +} + +.card h2 { + font-size: 1.5rem; + margin-bottom: 20px; + color: var(--text-primary); + display: flex; + align-items: center; + gap: 8px; +} + +/* Task Categories */ +.categories-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); + gap: 24px; + margin-bottom: 24px; +} + +.category-card { + border: 2px solid var(--border-color); + border-radius: 8px; + padding: 16px; + cursor: pointer; + transition: all 0.2s; +} + +.category-card:hover { + border-color: var(--primary-color); + box-shadow: var(--shadow); +} + +.category-card.selected { + border-color: var(--primary-color); + background: #eff6ff; +} + +.category-card h3 { + font-size: 1.2rem; + margin-bottom: 8px; + color: var(--primary-color); +} + +.category-card p { + color: var(--text-secondary); + font-size: 0.9rem; + margin-bottom: 12px; +} + +.tasks-list { + display: none; + margin-top: 12px; +} + +.category-card.expanded .tasks-list { + display: flex; + flex-direction: column; + gap: 12px; +} + +.task-item { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 8px; + padding: 8px; + border-radius: 4px; + transition: background 0.2s; +} + +.task-item:hover { + background: var(--background); +} + +.task-item input[type="checkbox"] { + margin: 0; +} + +.task-item label { + flex: 1; + cursor: pointer; + font-size: 0.9rem; +} + 
+.task-item .task-title { + font-weight: 600; + display: inline-block; + margin-bottom: 4px; +} + +.task-item.has-configs { + flex-direction: column; + align-items: flex-start; + border: 1px solid var(--border-color); + padding: 0; + margin-bottom: 12px; + min-width: 0; + overflow: hidden; +} + +.task-header { + width: 100%; + display: flex; + justify-content: space-between; + align-items: center; + padding: 12px; + cursor: pointer; + background: var(--task-header-bg); + transition: background 0.2s; +} + +.task-header:hover { + background: var(--task-header-hover); +} + +.task-item .task-info { + display: flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + min-width: 0; + flex: 1; +} + +.config-count-badge { + background: var(--primary-color); + color: white; + padding: 2px 8px; + border-radius: 12px; + font-size: 0.75rem; + font-weight: 600; +} + +.expand-icon { + font-size: 0.9rem; + color: var(--text-secondary); + user-select: none; +} + +.config-options { + width: 100%; + padding: 12px; + border-top: 1px solid var(--border-color); + display: none; + flex-direction: column; + gap: 12px; + min-width: 0; + box-sizing: border-box; +} + +.task-item.has-configs.expanded .config-options { + display: flex; +} + +.config-actions { + display: flex; + gap: 8px; +} + +.config-actions button { + padding: 4px 12px; + font-size: 0.85rem; + border: 1px solid var(--primary-color); + background: white; + color: var(--primary-color); + border-radius: 4px; + cursor: pointer; + transition: all 0.2s; + font-weight: 500; +} + +.config-actions button:hover { + background: var(--primary-color); + color: white; + border-color: var(--primary-color); +} + +.config-list { + display: flex; + flex-wrap: wrap; + gap: 8px 16px; + max-height: 400px; + overflow-y: auto; + overflow-x: hidden; + padding: 8px 0; + min-width: 0; + width: 100%; +} + +.config-label { + width: 100%; + font-size: 0.85rem; + font-weight: 600; + color: var(--text-secondary); +} + +.config-item { + display: flex; 
+ align-items: center; + gap: 6px; + font-size: 0.9rem; + min-width: 0; +} + +.config-item label { + word-break: break-word; + overflow-wrap: break-word; + display: flex; + align-items: center; + gap: 8px; + flex: 1; +} + +.config-name { + font-weight: 500; +} + +.config-metrics { + font-size: 0.75rem; + color: var(--text-secondary); + font-style: italic; +} + +.config-item input[type="checkbox"] { + margin: 0; + flex-shrink: 0; +} + +.metrics-chips { + display: flex; + gap: 4px; + flex-wrap: wrap; + margin-top: 4px; +} + +.metric-chip { + background: var(--secondary-color); + color: white; + padding: 2px 8px; + border-radius: 12px; + font-size: 0.75rem; +} + +/* Quick Filters */ +.quick-filters { + margin: 16px 0; + padding: 16px; + background: var(--filter-bg); + border-radius: 8px; + border: 1px solid var(--border-color); +} + +.quick-filters h4 { + margin: 0 0 12px 0; + font-size: 0.9rem; + color: var(--text-secondary); +} + +.filter-buttons { + display: flex; + flex-wrap: wrap; + gap: 8px; +} + +.filter-btn { + padding: 6px 14px; + font-size: 0.85rem; + border: 1px solid var(--primary-color); + background: white; + color: var(--primary-color); + border-radius: 6px; + cursor: pointer; + transition: all 0.2s; + font-weight: 500; +} + +.filter-btn:hover { + background: var(--primary-color); + color: white; + border-color: var(--primary-color); +} + +.filter-btn.secondary { + background: var(--secondary-color); + color: white; + border-color: var(--secondary-color); +} + +.filter-btn.secondary:hover { + background: #475569; +} + +.selected-tasks { + border-top: 1px solid var(--border-color); + padding-top: 20px; +} + +.selected-task-item { + display: flex; + justify-content: space-between; + align-items: center; + padding: 8px 12px; + background: var(--background); + border-radius: 6px; + margin-bottom: 8px; +} + +.selected-task-item button { + background: var(--error-color); + color: white; + border: none; + padding: 4px 8px; + border-radius: 4px; + cursor: 
pointer; + font-size: 0.8rem; +} + +.task-selection-controls { + display: flex; + gap: 12px; + margin-bottom: 20px; + justify-content: center; +} + +/* Model Configuration */ +.model-config-item { + border: 2px solid var(--border-color); + border-radius: 12px; + padding: 20px; + margin-bottom: 20px; + background: #fafbfc; +} + +.model-config-item h3 { + margin-bottom: 16px; + color: var(--primary-color); + font-size: 1.2rem; +} + +.model-actions { + display: flex; + gap: 12px; + margin-top: 20px; + padding-top: 20px; + border-top: 1px solid var(--border-color); +} + +.remove-model-btn { + background: var(--error-color); + color: white; + border: none; + padding: 8px 16px; + border-radius: 6px; + cursor: pointer; + font-size: 0.9rem; + margin-top: 16px; +} + +.remove-model-btn:hover { + background: #b91c1c; +} + +#load-example-btn { + background: var(--secondary-color); +} + +#load-example-btn:hover { + background: #475569; +} + +/* Form Elements */ +.form-group { + margin-bottom: 16px; +} + +.form-group label { + display: block; + margin-bottom: 6px; + font-weight: 500; + color: var(--text-primary); +} + +.required-star { + color: #FF5C72; + font-weight: 500; +} + +.form-group input, +.form-group select { + width: 100%; + padding: 10px 12px; + border: 1px solid var(--border-color); + border-radius: 6px; + font-size: 0.95rem; + transition: border-color 0.2s; +} + +.form-group input:focus, +.form-group select:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1); +} + +.form-group textarea { + width: 100%; + padding: 10px 12px; + border: 1px solid var(--border-color); + border-radius: 6px; + font-size: 0.95rem; + font-family: inherit; + resize: vertical; + min-height: 100px; +} + +.form-group textarea:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1); +} + +.form-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + 
gap: 16px; +} + +/* Buttons */ +button { + background: var(--primary-color); + color: white; + border: none; + padding: 10px 16px; + border-radius: 6px; + cursor: pointer; + font-size: 0.95rem; + font-weight: 500; + transition: all 0.2s; +} + +button:hover { + background: #1d4ed8; + transform: translateY(-1px); +} + +button:active { + transform: translateY(0); +} + +button.secondary { + background: var(--secondary-color); +} + +#copy-config { + margin-left: auto; +} + +button.success { + background: var(--success-color); +} + +button.success:hover { + background: #15803d; +} + +/* Configuration Preview */ +#config-preview { + background: #1e293b; + color: #e2e8f0; + padding: 20px; + border-radius: 8px; + font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-size: 0.9rem; + line-height: 1.5; + overflow-x: auto; + white-space: pre-wrap; + max-height: 400px; + overflow-y: auto; +} + +.preview-actions { + display: flex; + gap: 12px; + margin-bottom: 20px; + flex-wrap: wrap; +} + +@media (max-width: 768px) { + .container { + padding: 12px; + } + + header { + padding: 24px 16px; + margin-bottom: 24px; + } + + header h1 { + font-size: 2rem; + } + + header p { + font-size: 1rem; + } + + .card { + padding: 16px; + margin-bottom: 16px; + } + + .card h2 { + font-size: 1.3rem; + margin-bottom: 16px; + } + + .categories-grid, + .model-grid { + grid-template-columns: 1fr; + gap: 12px; + } + + .category-card { + padding: 12px; + } + + .task-options .tasks-list { + grid-template-columns: 1fr; + gap: 8px; + } + + .form-grid { + grid-template-columns: 1fr; + gap: 12px; + } + + .form-group { + margin-bottom: 12px; + } + + .preview-actions { + flex-direction: column; + gap: 8px; + } + + .preview-actions button { + width: 100%; + padding: 12px; + } + + button { + width: 100%; + padding: 12px; + font-size: 1rem; + } + + .model-actions { + flex-direction: column; + gap: 8px; + } + + .model-config-item { + padding: 16px; + } + + .selected-task-item { + flex-direction: column; + 
align-items: flex-start; + gap: 8px; + } + + .selected-task-item button { + align-self: flex-end; + } + + .results-controls { + flex-direction: column; + gap: 8px; + } + + .results-controls button { + width: 100%; + } + + .results-grid { + grid-template-columns: 1fr; + } + + #config-preview { + font-size: 0.8rem; + padding: 12px; + max-height: 300px; + } +} + +/* Tablet styles */ +@media (max-width: 1024px) and (min-width: 769px) { + .container { + padding: 16px; + max-width: 100%; + } + + .form-grid { + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 14px; + } + + .categories-grid { + grid-template-columns: repeat(auto-fit, minmax(350px, 1fr)); + } + + .task-options .tasks-list { + grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); + } + + .results-grid { + grid-template-columns: repeat(auto-fit, minmax(350px, 1fr)); + } + + .model-actions { + flex-wrap: wrap; + } + + .preview-actions { + flex-wrap: wrap; + } +} + +/* Large desktop styles */ +@media (min-width: 1201px) { + .container { + max-width: 1400px; + } + + .form-grid { + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + } + + .categories-grid { + grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); + } + + .task-options .tasks-list { + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + } +} + +/* Small mobile styles */ +@media (max-width: 480px) { + .container { + padding: 8px; + } + + header { + padding: 20px 12px; + border-radius: 8px; + } + + header h1 { + font-size: 1.8rem; + } + + header p { + font-size: 0.95rem; + } + + .card { + padding: 12px; + border-radius: 8px; + } + + .card h2 { + font-size: 1.2rem; + margin-bottom: 12px; + } + + .section-description { + font-size: 0.9rem; + margin-bottom: 16px; + } + + .model-config-item { + padding: 12px; + border-radius: 8px; + } + + .model-config-item h3 { + font-size: 1.1rem; + margin-bottom: 12px; + } + + .form-group input, + .form-group select, + .form-group textarea { + padding: 12px; + 
font-size: 1rem; + } + + .task-item { + padding: 12px 8px; + } + + .task-item label { + font-size: 1rem; + } + + .metric-chip { + font-size: 0.8rem; + padding: 4px 10px; + } + + .selected-task-item { + padding: 12px; + } + + #config-preview { + font-size: 0.75rem; + padding: 10px; + border-radius: 6px; + } +} + +/* Advanced config styling */ +.advanced-config { + margin-top: 16px; + border: 1px solid var(--border-color); + border-radius: 8px; +} + +.advanced-config summary { + padding: 12px 16px; + background: var(--background); + cursor: pointer; + font-weight: 500; + color: var(--text-primary); + border-radius: 8px 8px 0 0; +} + +.advanced-config summary:hover { + background: #f1f5f9; +} + +.advanced-config[open] summary { + border-bottom: 1px solid var(--border-color); + border-radius: 8px 8px 0 0; + margin-bottom: 0; +} + +.advanced-config .form-grid { + padding: 16px; +} + +/* Task selection UI */ +.task-selection-ui { + margin-bottom: 24px; +} + +.task-options { + margin-top: 16px; + padding: 16px; + background: var(--background); + border-radius: 8px; + border: 1px solid var(--border-color); +} + +.task-options h4 { + margin: 0 0 8px 0; + color: var(--primary-color); + font-size: 1.1rem; +} + +.task-options p { + margin: 0 0 16px 0; + color: var(--text-secondary); + font-size: 0.9rem; +} + +.task-options .tasks-list { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 12px; +} + +.category-group { + margin-bottom: 32px; + padding-bottom: 24px; + border-bottom: 1px solid var(--border-color); +} + +.category-group:last-child { + border-bottom: none; + margin-bottom: 0; +} + +.category-group h4 { + margin: 0 0 8px 0; + color: var(--primary-color); + font-size: 1.2rem; +} + +.category-group p { + margin: 0 0 16px 0; + color: var(--text-secondary); + font-size: 0.95rem; +} diff --git a/ui/tasks.js b/ui/tasks.js new file mode 100644 index 0000000..3369cea --- /dev/null +++ b/ui/tasks.js @@ -0,0 +1,1318 @@ +// Auto-generated by 
generate_tasks.py +// This file allows the UI to work when opening index.html directly in a browser +window.TASKS_DATA = { + "audio_understanding": { + "name": "🔊 Audio Understanding", + "description": "Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.", + "tasks": { + "music_understanding": { + "name": "Music Understanding", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mu_chomusic_test" + ] + }, + "scene_understanding": { + "name": "Scene Understanding", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "audiocaps_qa_test", + "audiocaps_test", + "clotho_aqa_test", + "wavcaps_qa_test", + "wavcaps_test" + ] + } + } + }, + "paralinguistics": { + "name": "🎭 Paralinguistics", + "description": "Tasks that analyze non-verbal aspects of speech such as emotion, gender, accent, and speaker characteristics.", + "tasks": { + "accent_recognition": { + "name": "Accent Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mnsc_pqa_ar_dialogue_test", + "mnsc_pqa_ar_sentence_test", + "voxceleb_accent_test" + ] + }, + "emotion_recognition": { + "name": "Emotion Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_emotion_recognition", + "meld_emotion_test", + "meld_sentiment_test" + ] + }, + "gender_recognition": { + "name": "Gender Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_gender_recognition", + "mnsc_pqa_gr_dialogue_test", + "mnsc_pqa_gr_sentence_test", + "voxceleb_gender_test" + ] + }, + "speaker_diarization": { + "name": "Speaker Diarization", + "metrics": [ + "diarization_metrics" + ], + "configs": [ + "callhome_diarization_deu", + "callhome_diarization_eng", + "callhome_diarization_jpn", + "callhome_diarization_spa", + "callhome_diarization_zho" + ] + }, + "speaker_recognition": { + "name": "Speaker Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mmau_mini" + ] + } + } + }, + 
"phonetics": { + "name": "📢 Phonetics", + "description": "Tasks related to phonetic analysis, phoneme recognition, and speech sound processing.", + "tasks": { + "phonemes": { + "name": "Phonemes", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "voxangeles_phoneme_counting" + ] + } + } + }, + "safety_and_security": { + "name": "🔐 Safety and Security", + "description": "Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.", + "tasks": { + "safety": { + "name": "Safety", + "metrics": [ + "llm_judge_redteaming" + ], + "configs": [ + "advbench" + ] + }, + "spoofing": { + "name": "Spoofing", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "asvspoof" + ] + } + } + }, + "speech_disorder": { + "name": "🩺 Speech Disorder", + "description": "Tasks related to detecting and analyzing speech disorders and voice pathologies.", + "tasks": { + "voice_disorder": { + "name": "Voice Disorder", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "stuttering_detection" + ] + } + } + }, + "speech_enhancement": { + "name": "✨ Speech Enhancement", + "description": "Tasks related to speech quality improvement, noise detection, and audio enhancement.", + "tasks": { + "noise_detection": { + "name": "Noise Detection", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "noise_detection" + ] + } + } + }, + "speech_recognition": { + "name": "🗣️ Speech Recognition", + "description": "Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.", + "tasks": { + "asr": { + "name": "Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "aishell_1_test", + "ami_ihm", + "ami_sdm", + "callhome_asr_deu", + "callhome_asr_eng", + "callhome_asr_jpn", + "callhome_asr_spa", + "callhome_asr_zho", + "common_voice_15_ab", + 
"common_voice_15_af", + "common_voice_15_am", + "common_voice_15_ar", + "common_voice_15_as", + "common_voice_15_ast", + "common_voice_15_az", + "common_voice_15_ba", + "common_voice_15_bas", + "common_voice_15_be", + "common_voice_15_bg", + "common_voice_15_bn", + "common_voice_15_br", + "common_voice_15_ca", + "common_voice_15_ckb", + "common_voice_15_cnh", + "common_voice_15_cs", + "common_voice_15_cv", + "common_voice_15_cy", + "common_voice_15_da", + "common_voice_15_de", + "common_voice_15_dv", + "common_voice_15_dyu", + "common_voice_15_el", + "common_voice_15_en", + "common_voice_15_eo", + "common_voice_15_es", + "common_voice_15_et", + "common_voice_15_eu", + "common_voice_15_fa", + "common_voice_15_fi", + "common_voice_15_fr", + "common_voice_15_fy-NL", + "common_voice_15_ga-IE", + "common_voice_15_gl", + "common_voice_15_ha", + "common_voice_15_hi", + "common_voice_15_hsb", + "common_voice_15_hu", + "common_voice_15_hy-AM", + "common_voice_15_ia", + "common_voice_15_id", + "common_voice_15_ig", + "common_voice_15_it", + "common_voice_15_ja", + "common_voice_15_ka", + "common_voice_15_kab", + "common_voice_15_kk", + "common_voice_15_kmr", + "common_voice_15_ko", + "common_voice_15_ky", + "common_voice_15_lg", + "common_voice_15_lt", + "common_voice_15_lv", + "common_voice_15_mdf", + "common_voice_15_mg", + "common_voice_15_mk", + "common_voice_15_ml", + "common_voice_15_mn", + "common_voice_15_mr", + "common_voice_15_mt", + "common_voice_15_myv", + "common_voice_15_ne-NP", + "common_voice_15_nl", + "common_voice_15_nn-NO", + "common_voice_15_or", + "common_voice_15_pa-IN", + "common_voice_15_pl", + "common_voice_15_pt", + "common_voice_15_rm-sursilv", + "common_voice_15_rm-vallader", + "common_voice_15_ro", + "common_voice_15_ru", + "common_voice_15_rw", + "common_voice_15_sah", + "common_voice_15_sat", + "common_voice_15_sc", + "common_voice_15_sk", + "common_voice_15_sl", + "common_voice_15_sr", + "common_voice_15_sv-SE", + "common_voice_15_sw", + 
"common_voice_15_ta", + "common_voice_15_te", + "common_voice_15_tg", + "common_voice_15_th", + "common_voice_15_ti", + "common_voice_15_tok", + "common_voice_15_tr", + "common_voice_15_tt", + "common_voice_15_ug", + "common_voice_15_uk", + "common_voice_15_ur", + "common_voice_15_uz", + "common_voice_15_vot", + "common_voice_15_yi", + "common_voice_15_yue", + "common_voice_15_zh-CN", + "common_voice_15_zh-HK", + "common_voice_15_zh-TW", + "fleurs_af_za", + "fleurs_am_et", + "fleurs_ar_eg", + "fleurs_as_in", + "fleurs_ast_es", + "fleurs_az_az", + "fleurs_be_by", + "fleurs_bg_bg", + "fleurs_bn_in", + "fleurs_bs_ba", + "fleurs_ca_es", + "fleurs_ceb_ph", + "fleurs_ckb_iq", + "fleurs_cmn_hans_cn", + "fleurs_cs_cz", + "fleurs_cy_gb", + "fleurs_da_dk", + "fleurs_de_de", + "fleurs_el_gr", + "fleurs_en_us", + "fleurs_es_419", + "fleurs_et_ee", + "fleurs_fa_ir", + "fleurs_ff_sn", + "fleurs_fi_fi", + "fleurs_fil_ph", + "fleurs_fr_fr", + "fleurs_ga_ie", + "fleurs_gl_es", + "fleurs_gu_in", + "fleurs_ha_ng", + "fleurs_he_il", + "fleurs_hi_in", + "fleurs_hr_hr", + "fleurs_hu_hu", + "fleurs_hy_am", + "fleurs_id_id", + "fleurs_ig_ng", + "fleurs_is_is", + "fleurs_it_it", + "fleurs_ja_jp", + "fleurs_jv_id", + "fleurs_ka_ge", + "fleurs_kam_ke", + "fleurs_kea_cv", + "fleurs_kk_kz", + "fleurs_km_kh", + "fleurs_kn_in", + "fleurs_ko_kr", + "fleurs_ky_kg", + "fleurs_lb_lu", + "fleurs_lg_ug", + "fleurs_ln_cd", + "fleurs_lo_la", + "fleurs_lt_lt", + "fleurs_luo_ke", + "fleurs_lv_lv", + "fleurs_mi_nz", + "fleurs_mk_mk", + "fleurs_ml_in", + "fleurs_mn_mn", + "fleurs_mr_in", + "fleurs_ms_my", + "fleurs_mt_mt", + "fleurs_my_mm", + "fleurs_nb_no", + "fleurs_ne_np", + "fleurs_nl_nl", + "fleurs_nso_za", + "fleurs_ny_mw", + "fleurs_oc_fr", + "fleurs_om_et", + "fleurs_or_in", + "fleurs_pa_in", + "fleurs_pl_pl", + "fleurs_ps_af", + "fleurs_pt_br", + "fleurs_ro_ro", + "fleurs_ru_ru", + "fleurs_sd_in", + "fleurs_sk_sk", + "fleurs_sl_si", + "fleurs_sn_zw", + "fleurs_so_so", + "fleurs_sr_rs", + 
"fleurs_sv_se", + "fleurs_sw_ke", + "fleurs_ta_in", + "fleurs_te_in", + "fleurs_tg_tj", + "fleurs_th_th", + "fleurs_tr_tr", + "fleurs_uk_ua", + "fleurs_umb_ao", + "fleurs_ur_pk", + "fleurs_uz_uz", + "fleurs_vi_vn", + "fleurs_wo_sn", + "fleurs_xh_za", + "fleurs_yo_ng", + "fleurs_yue_hant_hk", + "fleurs_zu_za", + "gigaspeech_test", + "gigaspeech2_id_test", + "gigaspeech2_th_test", + "gigaspeech2_vi_test", + "librispeech_test_clean", + "librispeech_test_other", + "librispeech_multilingual_dutch", + "librispeech_multilingual_french", + "librispeech_multilingual_german", + "librispeech_multilingual_italian", + "librispeech_multilingual_polish", + "librispeech_multilingual_portuguese", + "librispeech_multilingual_spanish", + "mnsc_asr_part1_test", + "mnsc_asr_part2_test", + "mnsc_asr_part3_test", + "mnsc_asr_part4_test", + "mnsc_asr_part5_test", + "mnsc_asr_part6_test", + "peoples_speech_test", + "spgispeech_test", + "tedlium3_test", + "voxpopuli_cs", + "voxpopuli_de", + "voxpopuli_en", + "voxpopuli_en_accented", + "voxpopuli_es", + "voxpopuli_et", + "voxpopuli_fi", + "voxpopuli_fr", + "voxpopuli_hr", + "voxpopuli_hu", + "voxpopuli_it", + "voxpopuli_lt", + "voxpopuli_nl", + "voxpopuli_pl", + "voxpopuli_ro", + "voxpopuli_sk", + "voxpopuli_sl" + ] + }, + "code_switching_asr": { + "name": "Code Switching Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "seame_dev_man", + "seame_dev_sge" + ] + }, + "long_form_asr": { + "name": "Long Form Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "earnings21", + "earnings22", + "tedlium3_long_form" + ] + } + } + }, + "spoken_language_reasoning": { + "name": "🧩 Spoken Language Reasoning", + "description": "Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.", + "tasks": { + "bfcl": { + "name": "Bfcl", + "metrics": [ + "bfcl_match_score" + ], + "configs": [ + "bfcl_audio_irrelevance", + "bfcl_audio_multiple", + "bfcl_audio_parallel", + 
"bfcl_audio_parallel_multiple", + "bfcl_audio_simple", + "bfcl_audio_irrelevance_no_prompt", + "bfcl_audio_multiple_no_prompt", + "bfcl_audio_parallel_multiple_no_prompt", + "bfcl_audio_parallel_no_prompt", + "bfcl_audio_simple_no_prompt", + "bfcl_text_irrelevance", + "bfcl_text_multiple", + "bfcl_text_parallel", + "bfcl_text_parallel_multiple", + "bfcl_text_simple", + "bfcl_text_irrelevance_no_prompt", + "bfcl_text_multiple_no_prompt", + "bfcl_text_parallel_multiple_no_prompt", + "bfcl_text_parallel_no_prompt", + "bfcl_text_simple_no_prompt" + ] + }, + "gsm8k": { + "name": "Gsm8K", + "metrics": [ + "gsm8k_exact_match" + ], + "configs": [ + "gsm8k_audio", + "gsm8k_text" + ] + }, + "ifeval": { + "name": "Ifeval", + "metrics": [ + "instruction_following" + ], + "configs": [ + "voicebench_ifeval_audio", + "voicebench_ifeval_text" + ] + }, + "mtbench": { + "name": "Mtbench", + "metrics": [ + "mt_bench_llm_judge" + ], + "configs": [ + "mtbench_audio", + "mtbench_text" + ] + }, + "speech_to_sql": { + "name": "Speech To Sql", + "metrics": [ + "sql_score" + ], + "configs": [ + "spider_audio", + "spider_text" + ] + } + } + }, + "spoken_language_understanding": { + "name": "🧠 Spoken Language Understanding", + "description": "Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.", + "tasks": { + "intent_classification": { + "name": "Intent Classification", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "SLURP-intent" + ] + }, + "speech_qa": { + "name": "Speech Qa", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "alpaca_audio_test", + "cn_college_listen_mcq_test", + "dream_tts_mcq_test", + "mnsc_sqa_part3_test", + "mnsc_sqa_part4_test", + "mnsc_sqa_part5_test", + "mnsc_sqa_part6_test", + "openhermes_instruction_test", + "public_sg_speech_qa_test", + "slue_p2_sqa5_test", + "spoken_squad_test" + ] + }, + "spoken_dialogue": { + "name": "Spoken Dialogue", + 
"metrics": [ + "joint_goal_accuracy", + "slot_accuracy", + "slot_f1", + "bleu" + ], + "configs": [ + "spokenwoz_audio", + "spokenwoz_text" + ] + }, + "spoken_dialogue_summarization": { + "name": "Spoken Dialogue Summarization", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "mnsc_sds_part3_test", + "mnsc_sds_part4_test", + "mnsc_sds_part5_test" + ] + }, + "sqqa": { + "name": "Sqqa", + "metrics": [ + "llm_judge_big_bench_audio" + ], + "configs": [ + "big_bench_audio_audio_query", + "big_bench_audio_text_query", + "mmsu_biology", + "mmsu_business", + "mmsu_chemistry", + "mmsu_economics", + "mmsu_engineering", + "mmsu_health", + "mmsu_history", + "mmsu_law", + "mmsu_other", + "mmsu_philosophy", + "mmsu_physics", + "mmsu_psychology", + "openbookqa", + "sd-qa_aus_audio", + "sd-qa_aus_text", + "sd-qa_gbr_audio", + "sd-qa_gbr_text", + "sd-qa_ind_n_audio", + "sd-qa_ind_n_text", + "sd-qa_ind_s_audio", + "sd-qa_ind_s_text", + "sd-qa_irl_audio", + "sd-qa_irl_text", + "sd-qa_kenya_audio", + "sd-qa_kenya_text", + "sd-qa_nga_audio", + "sd-qa_nga_text", + "sd-qa_nzl_audio", + "sd-qa_nzl_text", + "sd-qa_phl_audio", + "sd-qa_phl_text", + "sd-qa_usa_audio", + "sd-qa_usa_text", + "sd-qa_zaf_audio", + "sd-qa_zaf_text" + ] + }, + "translation": { + "name": "Translation", + "metrics": [ + "bleu", + "meteor", + "bertscore", + "comet" + ], + "configs": [ + "covost2_ar_en", + "covost2_ca_en", + "covost2_cy_en", + "covost2_de_en", + "covost2_en_ar", + "covost2_en_ca", + "covost2_en_cy", + "covost2_en_de", + "covost2_en_et", + "covost2_en_fa", + "covost2_en_id", + "covost2_en_ja", + "covost2_en_lv", + "covost2_en_mn", + "covost2_en_sl", + "covost2_en_sv-SE", + "covost2_en_ta", + "covost2_en_tr", + "covost2_en_zh-CN", + "covost2_es_en", + "covost2_et_en", + "covost2_fa_en", + "covost2_fr_en", + "covost2_id_en", + "covost2_it_en", + "covost2_ja_en", + "covost2_lv_en", + "covost2_mn_en", + "covost2_nl_en", + "covost2_pt_en", + "covost2_ru_en", + "covost2_sl_en", + 
"covost2_sv-SE_en", + "covost2_ta_en", + "covost2_tr_en", + "covost2_zh-CN_en" + ] + } + } + }, + "music_understanding": { + "name": "Music Understanding", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mu_chomusic_test" + ], + "category": "audio_understanding" + }, + "scene_understanding": { + "name": "Scene Understanding", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "audiocaps_qa_test", + "audiocaps_test", + "clotho_aqa_test", + "wavcaps_qa_test", + "wavcaps_test" + ], + "category": "audio_understanding" + }, + "accent_recognition": { + "name": "Accent Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mnsc_pqa_ar_dialogue_test", + "mnsc_pqa_ar_sentence_test", + "voxceleb_accent_test" + ], + "category": "paralinguistics" + }, + "emotion_recognition": { + "name": "Emotion Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_emotion_recognition", + "meld_emotion_test", + "meld_sentiment_test" + ], + "category": "paralinguistics" + }, + "gender_recognition": { + "name": "Gender Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_gender_recognition", + "mnsc_pqa_gr_dialogue_test", + "mnsc_pqa_gr_sentence_test", + "voxceleb_gender_test" + ], + "category": "paralinguistics" + }, + "speaker_diarization": { + "name": "Speaker Diarization", + "metrics": [ + "diarization_metrics" + ], + "configs": [ + "callhome_diarization_deu", + "callhome_diarization_eng", + "callhome_diarization_jpn", + "callhome_diarization_spa", + "callhome_diarization_zho" + ], + "category": "paralinguistics" + }, + "speaker_recognition": { + "name": "Speaker Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mmau_mini" + ], + "category": "paralinguistics" + }, + "phonemes": { + "name": "Phonemes", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "voxangeles_phoneme_counting" + ], + "category": "phonetics" + }, + "safety": { + "name": 
"Safety", + "metrics": [ + "llm_judge_redteaming" + ], + "configs": [ + "advbench" + ], + "category": "safety_and_security" + }, + "spoofing": { + "name": "Spoofing", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "asvspoof" + ], + "category": "safety_and_security" + }, + "voice_disorder": { + "name": "Voice Disorder", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "stuttering_detection" + ], + "category": "speech_disorder" + }, + "noise_detection": { + "name": "Noise Detection", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "noise_detection" + ], + "category": "speech_enhancement" + }, + "asr": { + "name": "Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "aishell_1_test", + "ami_ihm", + "ami_sdm", + "callhome_asr_deu", + "callhome_asr_eng", + "callhome_asr_jpn", + "callhome_asr_spa", + "callhome_asr_zho", + "common_voice_15_ab", + "common_voice_15_af", + "common_voice_15_am", + "common_voice_15_ar", + "common_voice_15_as", + "common_voice_15_ast", + "common_voice_15_az", + "common_voice_15_ba", + "common_voice_15_bas", + "common_voice_15_be", + "common_voice_15_bg", + "common_voice_15_bn", + "common_voice_15_br", + "common_voice_15_ca", + "common_voice_15_ckb", + "common_voice_15_cnh", + "common_voice_15_cs", + "common_voice_15_cv", + "common_voice_15_cy", + "common_voice_15_da", + "common_voice_15_de", + "common_voice_15_dv", + "common_voice_15_dyu", + "common_voice_15_el", + "common_voice_15_en", + "common_voice_15_eo", + "common_voice_15_es", + "common_voice_15_et", + "common_voice_15_eu", + "common_voice_15_fa", + "common_voice_15_fi", + "common_voice_15_fr", + "common_voice_15_fy-NL", + "common_voice_15_ga-IE", + "common_voice_15_gl", + "common_voice_15_ha", + "common_voice_15_hi", + "common_voice_15_hsb", + "common_voice_15_hu", + "common_voice_15_hy-AM", + "common_voice_15_ia", + "common_voice_15_id", + "common_voice_15_ig", + 
"common_voice_15_it", + "common_voice_15_ja", + "common_voice_15_ka", + "common_voice_15_kab", + "common_voice_15_kk", + "common_voice_15_kmr", + "common_voice_15_ko", + "common_voice_15_ky", + "common_voice_15_lg", + "common_voice_15_lt", + "common_voice_15_lv", + "common_voice_15_mdf", + "common_voice_15_mg", + "common_voice_15_mk", + "common_voice_15_ml", + "common_voice_15_mn", + "common_voice_15_mr", + "common_voice_15_mt", + "common_voice_15_myv", + "common_voice_15_ne-NP", + "common_voice_15_nl", + "common_voice_15_nn-NO", + "common_voice_15_or", + "common_voice_15_pa-IN", + "common_voice_15_pl", + "common_voice_15_pt", + "common_voice_15_rm-sursilv", + "common_voice_15_rm-vallader", + "common_voice_15_ro", + "common_voice_15_ru", + "common_voice_15_rw", + "common_voice_15_sah", + "common_voice_15_sat", + "common_voice_15_sc", + "common_voice_15_sk", + "common_voice_15_sl", + "common_voice_15_sr", + "common_voice_15_sv-SE", + "common_voice_15_sw", + "common_voice_15_ta", + "common_voice_15_te", + "common_voice_15_tg", + "common_voice_15_th", + "common_voice_15_ti", + "common_voice_15_tok", + "common_voice_15_tr", + "common_voice_15_tt", + "common_voice_15_ug", + "common_voice_15_uk", + "common_voice_15_ur", + "common_voice_15_uz", + "common_voice_15_vot", + "common_voice_15_yi", + "common_voice_15_yue", + "common_voice_15_zh-CN", + "common_voice_15_zh-HK", + "common_voice_15_zh-TW", + "fleurs_af_za", + "fleurs_am_et", + "fleurs_ar_eg", + "fleurs_as_in", + "fleurs_ast_es", + "fleurs_az_az", + "fleurs_be_by", + "fleurs_bg_bg", + "fleurs_bn_in", + "fleurs_bs_ba", + "fleurs_ca_es", + "fleurs_ceb_ph", + "fleurs_ckb_iq", + "fleurs_cmn_hans_cn", + "fleurs_cs_cz", + "fleurs_cy_gb", + "fleurs_da_dk", + "fleurs_de_de", + "fleurs_el_gr", + "fleurs_en_us", + "fleurs_es_419", + "fleurs_et_ee", + "fleurs_fa_ir", + "fleurs_ff_sn", + "fleurs_fi_fi", + "fleurs_fil_ph", + "fleurs_fr_fr", + "fleurs_ga_ie", + "fleurs_gl_es", + "fleurs_gu_in", + "fleurs_ha_ng", + "fleurs_he_il", 
+ "fleurs_hi_in", + "fleurs_hr_hr", + "fleurs_hu_hu", + "fleurs_hy_am", + "fleurs_id_id", + "fleurs_ig_ng", + "fleurs_is_is", + "fleurs_it_it", + "fleurs_ja_jp", + "fleurs_jv_id", + "fleurs_ka_ge", + "fleurs_kam_ke", + "fleurs_kea_cv", + "fleurs_kk_kz", + "fleurs_km_kh", + "fleurs_kn_in", + "fleurs_ko_kr", + "fleurs_ky_kg", + "fleurs_lb_lu", + "fleurs_lg_ug", + "fleurs_ln_cd", + "fleurs_lo_la", + "fleurs_lt_lt", + "fleurs_luo_ke", + "fleurs_lv_lv", + "fleurs_mi_nz", + "fleurs_mk_mk", + "fleurs_ml_in", + "fleurs_mn_mn", + "fleurs_mr_in", + "fleurs_ms_my", + "fleurs_mt_mt", + "fleurs_my_mm", + "fleurs_nb_no", + "fleurs_ne_np", + "fleurs_nl_nl", + "fleurs_nso_za", + "fleurs_ny_mw", + "fleurs_oc_fr", + "fleurs_om_et", + "fleurs_or_in", + "fleurs_pa_in", + "fleurs_pl_pl", + "fleurs_ps_af", + "fleurs_pt_br", + "fleurs_ro_ro", + "fleurs_ru_ru", + "fleurs_sd_in", + "fleurs_sk_sk", + "fleurs_sl_si", + "fleurs_sn_zw", + "fleurs_so_so", + "fleurs_sr_rs", + "fleurs_sv_se", + "fleurs_sw_ke", + "fleurs_ta_in", + "fleurs_te_in", + "fleurs_tg_tj", + "fleurs_th_th", + "fleurs_tr_tr", + "fleurs_uk_ua", + "fleurs_umb_ao", + "fleurs_ur_pk", + "fleurs_uz_uz", + "fleurs_vi_vn", + "fleurs_wo_sn", + "fleurs_xh_za", + "fleurs_yo_ng", + "fleurs_yue_hant_hk", + "fleurs_zu_za", + "gigaspeech_test", + "gigaspeech2_id_test", + "gigaspeech2_th_test", + "gigaspeech2_vi_test", + "librispeech_test_clean", + "librispeech_test_other", + "librispeech_multilingual_dutch", + "librispeech_multilingual_french", + "librispeech_multilingual_german", + "librispeech_multilingual_italian", + "librispeech_multilingual_polish", + "librispeech_multilingual_portuguese", + "librispeech_multilingual_spanish", + "mnsc_asr_part1_test", + "mnsc_asr_part2_test", + "mnsc_asr_part3_test", + "mnsc_asr_part4_test", + "mnsc_asr_part5_test", + "mnsc_asr_part6_test", + "peoples_speech_test", + "spgispeech_test", + "tedlium3_test", + "voxpopuli_cs", + "voxpopuli_de", + "voxpopuli_en", + "voxpopuli_en_accented", + 
"voxpopuli_es", + "voxpopuli_et", + "voxpopuli_fi", + "voxpopuli_fr", + "voxpopuli_hr", + "voxpopuli_hu", + "voxpopuli_it", + "voxpopuli_lt", + "voxpopuli_nl", + "voxpopuli_pl", + "voxpopuli_ro", + "voxpopuli_sk", + "voxpopuli_sl" + ], + "category": "speech_recognition" + }, + "code_switching_asr": { + "name": "Code Switching Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "seame_dev_man", + "seame_dev_sge" + ], + "category": "speech_recognition" + }, + "long_form_asr": { + "name": "Long Form Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "earnings21", + "earnings22", + "tedlium3_long_form" + ], + "category": "speech_recognition" + }, + "bfcl": { + "name": "Bfcl", + "metrics": [ + "bfcl_match_score" + ], + "configs": [ + "bfcl_audio_irrelevance", + "bfcl_audio_multiple", + "bfcl_audio_parallel", + "bfcl_audio_parallel_multiple", + "bfcl_audio_simple", + "bfcl_audio_irrelevance_no_prompt", + "bfcl_audio_multiple_no_prompt", + "bfcl_audio_parallel_multiple_no_prompt", + "bfcl_audio_parallel_no_prompt", + "bfcl_audio_simple_no_prompt", + "bfcl_text_irrelevance", + "bfcl_text_multiple", + "bfcl_text_parallel", + "bfcl_text_parallel_multiple", + "bfcl_text_simple", + "bfcl_text_irrelevance_no_prompt", + "bfcl_text_multiple_no_prompt", + "bfcl_text_parallel_multiple_no_prompt", + "bfcl_text_parallel_no_prompt", + "bfcl_text_simple_no_prompt" + ], + "category": "spoken_language_reasoning" + }, + "gsm8k": { + "name": "Gsm8K", + "metrics": [ + "gsm8k_exact_match" + ], + "configs": [ + "gsm8k_audio", + "gsm8k_text" + ], + "category": "spoken_language_reasoning" + }, + "ifeval": { + "name": "Ifeval", + "metrics": [ + "instruction_following" + ], + "configs": [ + "voicebench_ifeval_audio", + "voicebench_ifeval_text" + ], + "category": "spoken_language_reasoning" + }, + "mtbench": { + "name": "Mtbench", + "metrics": [ + "mt_bench_llm_judge" + ], + "configs": [ + "mtbench_audio", + "mtbench_text" + ], + "category": "spoken_language_reasoning" + }, 
+ "speech_to_sql": { + "name": "Speech To Sql", + "metrics": [ + "sql_score" + ], + "configs": [ + "spider_audio", + "spider_text" + ], + "category": "spoken_language_reasoning" + }, + "intent_classification": { + "name": "Intent Classification", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "SLURP-intent" + ], + "category": "spoken_language_understanding" + }, + "speech_qa": { + "name": "Speech Qa", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "alpaca_audio_test", + "cn_college_listen_mcq_test", + "dream_tts_mcq_test", + "mnsc_sqa_part3_test", + "mnsc_sqa_part4_test", + "mnsc_sqa_part5_test", + "mnsc_sqa_part6_test", + "openhermes_instruction_test", + "public_sg_speech_qa_test", + "slue_p2_sqa5_test", + "spoken_squad_test" + ], + "category": "spoken_language_understanding" + }, + "spoken_dialogue": { + "name": "Spoken Dialogue", + "metrics": [ + "joint_goal_accuracy", + "slot_accuracy", + "slot_f1", + "bleu" + ], + "configs": [ + "spokenwoz_audio", + "spokenwoz_text" + ], + "category": "spoken_language_understanding" + }, + "spoken_dialogue_summarization": { + "name": "Spoken Dialogue Summarization", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "mnsc_sds_part3_test", + "mnsc_sds_part4_test", + "mnsc_sds_part5_test" + ], + "category": "spoken_language_understanding" + }, + "sqqa": { + "name": "Sqqa", + "metrics": [ + "llm_judge_big_bench_audio" + ], + "configs": [ + "big_bench_audio_audio_query", + "big_bench_audio_text_query", + "mmsu_biology", + "mmsu_business", + "mmsu_chemistry", + "mmsu_economics", + "mmsu_engineering", + "mmsu_health", + "mmsu_history", + "mmsu_law", + "mmsu_other", + "mmsu_philosophy", + "mmsu_physics", + "mmsu_psychology", + "openbookqa", + "sd-qa_aus_audio", + "sd-qa_aus_text", + "sd-qa_gbr_audio", + "sd-qa_gbr_text", + "sd-qa_ind_n_audio", + "sd-qa_ind_n_text", + "sd-qa_ind_s_audio", + "sd-qa_ind_s_text", + "sd-qa_irl_audio", + "sd-qa_irl_text", + "sd-qa_kenya_audio", + "sd-qa_kenya_text", + 
"sd-qa_nga_audio", + "sd-qa_nga_text", + "sd-qa_nzl_audio", + "sd-qa_nzl_text", + "sd-qa_phl_audio", + "sd-qa_phl_text", + "sd-qa_usa_audio", + "sd-qa_usa_text", + "sd-qa_zaf_audio", + "sd-qa_zaf_text" + ], + "category": "spoken_language_understanding" + }, + "translation": { + "name": "Translation", + "metrics": [ + "bleu", + "meteor", + "bertscore", + "comet" + ], + "configs": [ + "covost2_ar_en", + "covost2_ca_en", + "covost2_cy_en", + "covost2_de_en", + "covost2_en_ar", + "covost2_en_ca", + "covost2_en_cy", + "covost2_en_de", + "covost2_en_et", + "covost2_en_fa", + "covost2_en_id", + "covost2_en_ja", + "covost2_en_lv", + "covost2_en_mn", + "covost2_en_sl", + "covost2_en_sv-SE", + "covost2_en_ta", + "covost2_en_tr", + "covost2_en_zh-CN", + "covost2_es_en", + "covost2_et_en", + "covost2_fa_en", + "covost2_fr_en", + "covost2_id_en", + "covost2_it_en", + "covost2_ja_en", + "covost2_lv_en", + "covost2_mn_en", + "covost2_nl_en", + "covost2_pt_en", + "covost2_ru_en", + "covost2_sl_en", + "covost2_sv-SE_en", + "covost2_ta_en", + "covost2_tr_en", + "covost2_zh-CN_en" + ], + "category": "spoken_language_understanding" + } +}; diff --git a/ui/tasks.json b/ui/tasks.json new file mode 100644 index 0000000..996b6f2 --- /dev/null +++ b/ui/tasks.json @@ -0,0 +1,1316 @@ +{ + "audio_understanding": { + "name": "🔊 Audio Understanding", + "description": "Tasks that require understanding of the general audio signals including but not limited to music, noise, sound.", + "tasks": { + "music_understanding": { + "name": "Music Understanding", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mu_chomusic_test" + ] + }, + "scene_understanding": { + "name": "Scene Understanding", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "audiocaps_qa_test", + "audiocaps_test", + "clotho_aqa_test", + "wavcaps_qa_test", + "wavcaps_test" + ] + } + } + }, + "paralinguistics": { + "name": "🎭 Paralinguistics", + "description": "Tasks that analyze non-verbal aspects of speech such as 
emotion, gender, accent, and speaker characteristics.", + "tasks": { + "accent_recognition": { + "name": "Accent Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mnsc_pqa_ar_dialogue_test", + "mnsc_pqa_ar_sentence_test", + "voxceleb_accent_test" + ] + }, + "emotion_recognition": { + "name": "Emotion Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_emotion_recognition", + "meld_emotion_test", + "meld_sentiment_test" + ] + }, + "gender_recognition": { + "name": "Gender Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_gender_recognition", + "mnsc_pqa_gr_dialogue_test", + "mnsc_pqa_gr_sentence_test", + "voxceleb_gender_test" + ] + }, + "speaker_diarization": { + "name": "Speaker Diarization", + "metrics": [ + "diarization_metrics" + ], + "configs": [ + "callhome_diarization_deu", + "callhome_diarization_eng", + "callhome_diarization_jpn", + "callhome_diarization_spa", + "callhome_diarization_zho" + ] + }, + "speaker_recognition": { + "name": "Speaker Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mmau_mini" + ] + } + } + }, + "phonetics": { + "name": "📢 Phonetics", + "description": "Tasks related to phonetic analysis, phoneme recognition, and speech sound processing.", + "tasks": { + "phonemes": { + "name": "Phonemes", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "voxangeles_phoneme_counting" + ] + } + } + }, + "safety_and_security": { + "name": "🔐 Safety and Security", + "description": "Tasks related to assessing model behavior around safety, robustness, and vulnerability to spoofing or adversarial content.", + "tasks": { + "safety": { + "name": "Safety", + "metrics": [ + "llm_judge_redteaming" + ], + "configs": [ + "advbench" + ] + }, + "spoofing": { + "name": "Spoofing", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "asvspoof" + ] + } + } + }, + "speech_disorder": { + "name": 
"🩺 Speech Disorder", + "description": "Tasks related to detecting and analyzing speech disorders and voice pathologies.", + "tasks": { + "voice_disorder": { + "name": "Voice Disorder", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "stuttering_detection" + ] + } + } + }, + "speech_enhancement": { + "name": "✨ Speech Enhancement", + "description": "Tasks related to speech quality improvement, noise detection, and audio enhancement.", + "tasks": { + "noise_detection": { + "name": "Noise Detection", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "noise_detection" + ] + } + } + }, + "speech_recognition": { + "name": "🗣️ Speech Recognition", + "description": "Tasks involving automatic speech recognition (ASR), including standard ASR, long-form ASR, and code-switching ASR.", + "tasks": { + "asr": { + "name": "Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "aishell_1_test", + "ami_ihm", + "ami_sdm", + "callhome_asr_deu", + "callhome_asr_eng", + "callhome_asr_jpn", + "callhome_asr_spa", + "callhome_asr_zho", + "common_voice_15_ab", + "common_voice_15_af", + "common_voice_15_am", + "common_voice_15_ar", + "common_voice_15_as", + "common_voice_15_ast", + "common_voice_15_az", + "common_voice_15_ba", + "common_voice_15_bas", + "common_voice_15_be", + "common_voice_15_bg", + "common_voice_15_bn", + "common_voice_15_br", + "common_voice_15_ca", + "common_voice_15_ckb", + "common_voice_15_cnh", + "common_voice_15_cs", + "common_voice_15_cv", + "common_voice_15_cy", + "common_voice_15_da", + "common_voice_15_de", + "common_voice_15_dv", + "common_voice_15_dyu", + "common_voice_15_el", + "common_voice_15_en", + "common_voice_15_eo", + "common_voice_15_es", + "common_voice_15_et", + "common_voice_15_eu", + "common_voice_15_fa", + "common_voice_15_fi", + "common_voice_15_fr", + "common_voice_15_fy-NL", + "common_voice_15_ga-IE", + "common_voice_15_gl", + "common_voice_15_ha", + 
"common_voice_15_hi", + "common_voice_15_hsb", + "common_voice_15_hu", + "common_voice_15_hy-AM", + "common_voice_15_ia", + "common_voice_15_id", + "common_voice_15_ig", + "common_voice_15_it", + "common_voice_15_ja", + "common_voice_15_ka", + "common_voice_15_kab", + "common_voice_15_kk", + "common_voice_15_kmr", + "common_voice_15_ko", + "common_voice_15_ky", + "common_voice_15_lg", + "common_voice_15_lt", + "common_voice_15_lv", + "common_voice_15_mdf", + "common_voice_15_mg", + "common_voice_15_mk", + "common_voice_15_ml", + "common_voice_15_mn", + "common_voice_15_mr", + "common_voice_15_mt", + "common_voice_15_myv", + "common_voice_15_ne-NP", + "common_voice_15_nl", + "common_voice_15_nn-NO", + "common_voice_15_or", + "common_voice_15_pa-IN", + "common_voice_15_pl", + "common_voice_15_pt", + "common_voice_15_rm-sursilv", + "common_voice_15_rm-vallader", + "common_voice_15_ro", + "common_voice_15_ru", + "common_voice_15_rw", + "common_voice_15_sah", + "common_voice_15_sat", + "common_voice_15_sc", + "common_voice_15_sk", + "common_voice_15_sl", + "common_voice_15_sr", + "common_voice_15_sv-SE", + "common_voice_15_sw", + "common_voice_15_ta", + "common_voice_15_te", + "common_voice_15_tg", + "common_voice_15_th", + "common_voice_15_ti", + "common_voice_15_tok", + "common_voice_15_tr", + "common_voice_15_tt", + "common_voice_15_ug", + "common_voice_15_uk", + "common_voice_15_ur", + "common_voice_15_uz", + "common_voice_15_vot", + "common_voice_15_yi", + "common_voice_15_yue", + "common_voice_15_zh-CN", + "common_voice_15_zh-HK", + "common_voice_15_zh-TW", + "fleurs_af_za", + "fleurs_am_et", + "fleurs_ar_eg", + "fleurs_as_in", + "fleurs_ast_es", + "fleurs_az_az", + "fleurs_be_by", + "fleurs_bg_bg", + "fleurs_bn_in", + "fleurs_bs_ba", + "fleurs_ca_es", + "fleurs_ceb_ph", + "fleurs_ckb_iq", + "fleurs_cmn_hans_cn", + "fleurs_cs_cz", + "fleurs_cy_gb", + "fleurs_da_dk", + "fleurs_de_de", + "fleurs_el_gr", + "fleurs_en_us", + "fleurs_es_419", + "fleurs_et_ee", + 
"fleurs_fa_ir", + "fleurs_ff_sn", + "fleurs_fi_fi", + "fleurs_fil_ph", + "fleurs_fr_fr", + "fleurs_ga_ie", + "fleurs_gl_es", + "fleurs_gu_in", + "fleurs_ha_ng", + "fleurs_he_il", + "fleurs_hi_in", + "fleurs_hr_hr", + "fleurs_hu_hu", + "fleurs_hy_am", + "fleurs_id_id", + "fleurs_ig_ng", + "fleurs_is_is", + "fleurs_it_it", + "fleurs_ja_jp", + "fleurs_jv_id", + "fleurs_ka_ge", + "fleurs_kam_ke", + "fleurs_kea_cv", + "fleurs_kk_kz", + "fleurs_km_kh", + "fleurs_kn_in", + "fleurs_ko_kr", + "fleurs_ky_kg", + "fleurs_lb_lu", + "fleurs_lg_ug", + "fleurs_ln_cd", + "fleurs_lo_la", + "fleurs_lt_lt", + "fleurs_luo_ke", + "fleurs_lv_lv", + "fleurs_mi_nz", + "fleurs_mk_mk", + "fleurs_ml_in", + "fleurs_mn_mn", + "fleurs_mr_in", + "fleurs_ms_my", + "fleurs_mt_mt", + "fleurs_my_mm", + "fleurs_nb_no", + "fleurs_ne_np", + "fleurs_nl_nl", + "fleurs_nso_za", + "fleurs_ny_mw", + "fleurs_oc_fr", + "fleurs_om_et", + "fleurs_or_in", + "fleurs_pa_in", + "fleurs_pl_pl", + "fleurs_ps_af", + "fleurs_pt_br", + "fleurs_ro_ro", + "fleurs_ru_ru", + "fleurs_sd_in", + "fleurs_sk_sk", + "fleurs_sl_si", + "fleurs_sn_zw", + "fleurs_so_so", + "fleurs_sr_rs", + "fleurs_sv_se", + "fleurs_sw_ke", + "fleurs_ta_in", + "fleurs_te_in", + "fleurs_tg_tj", + "fleurs_th_th", + "fleurs_tr_tr", + "fleurs_uk_ua", + "fleurs_umb_ao", + "fleurs_ur_pk", + "fleurs_uz_uz", + "fleurs_vi_vn", + "fleurs_wo_sn", + "fleurs_xh_za", + "fleurs_yo_ng", + "fleurs_yue_hant_hk", + "fleurs_zu_za", + "gigaspeech_test", + "gigaspeech2_id_test", + "gigaspeech2_th_test", + "gigaspeech2_vi_test", + "librispeech_test_clean", + "librispeech_test_other", + "librispeech_multilingual_dutch", + "librispeech_multilingual_french", + "librispeech_multilingual_german", + "librispeech_multilingual_italian", + "librispeech_multilingual_polish", + "librispeech_multilingual_portuguese", + "librispeech_multilingual_spanish", + "mnsc_asr_part1_test", + "mnsc_asr_part2_test", + "mnsc_asr_part3_test", + "mnsc_asr_part4_test", + "mnsc_asr_part5_test", + 
"mnsc_asr_part6_test", + "peoples_speech_test", + "spgispeech_test", + "tedlium3_test", + "voxpopuli_cs", + "voxpopuli_de", + "voxpopuli_en", + "voxpopuli_en_accented", + "voxpopuli_es", + "voxpopuli_et", + "voxpopuli_fi", + "voxpopuli_fr", + "voxpopuli_hr", + "voxpopuli_hu", + "voxpopuli_it", + "voxpopuli_lt", + "voxpopuli_nl", + "voxpopuli_pl", + "voxpopuli_ro", + "voxpopuli_sk", + "voxpopuli_sl" + ] + }, + "code_switching_asr": { + "name": "Code Switching Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "seame_dev_man", + "seame_dev_sge" + ] + }, + "long_form_asr": { + "name": "Long Form Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "earnings21", + "earnings22", + "tedlium3_long_form" + ] + } + } + }, + "spoken_language_reasoning": { + "name": "🧩 Spoken Language Reasoning", + "description": "Tasks that require reasoning over spoken input, such as instruction following or logical/mathematical reasoning.", + "tasks": { + "bfcl": { + "name": "Bfcl", + "metrics": [ + "bfcl_match_score" + ], + "configs": [ + "bfcl_audio_irrelevance", + "bfcl_audio_multiple", + "bfcl_audio_parallel", + "bfcl_audio_parallel_multiple", + "bfcl_audio_simple", + "bfcl_audio_irrelevance_no_prompt", + "bfcl_audio_multiple_no_prompt", + "bfcl_audio_parallel_multiple_no_prompt", + "bfcl_audio_parallel_no_prompt", + "bfcl_audio_simple_no_prompt", + "bfcl_text_irrelevance", + "bfcl_text_multiple", + "bfcl_text_parallel", + "bfcl_text_parallel_multiple", + "bfcl_text_simple", + "bfcl_text_irrelevance_no_prompt", + "bfcl_text_multiple_no_prompt", + "bfcl_text_parallel_multiple_no_prompt", + "bfcl_text_parallel_no_prompt", + "bfcl_text_simple_no_prompt" + ] + }, + "gsm8k": { + "name": "Gsm8K", + "metrics": [ + "gsm8k_exact_match" + ], + "configs": [ + "gsm8k_audio", + "gsm8k_text" + ] + }, + "ifeval": { + "name": "Ifeval", + "metrics": [ + "instruction_following" + ], + "configs": [ + "voicebench_ifeval_audio", + "voicebench_ifeval_text" + ] + }, + "mtbench": { + 
"name": "Mtbench", + "metrics": [ + "mt_bench_llm_judge" + ], + "configs": [ + "mtbench_audio", + "mtbench_text" + ] + }, + "speech_to_sql": { + "name": "Speech To Sql", + "metrics": [ + "sql_score" + ], + "configs": [ + "spider_audio", + "spider_text" + ] + } + } + }, + "spoken_language_understanding": { + "name": "🧠 Spoken Language Understanding", + "description": "Tasks that require understanding of spoken language and/or audio information including QA, translation, summarization, and intent classification.", + "tasks": { + "intent_classification": { + "name": "Intent Classification", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "SLURP-intent" + ] + }, + "speech_qa": { + "name": "Speech Qa", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "alpaca_audio_test", + "cn_college_listen_mcq_test", + "dream_tts_mcq_test", + "mnsc_sqa_part3_test", + "mnsc_sqa_part4_test", + "mnsc_sqa_part5_test", + "mnsc_sqa_part6_test", + "openhermes_instruction_test", + "public_sg_speech_qa_test", + "slue_p2_sqa5_test", + "spoken_squad_test" + ] + }, + "spoken_dialogue": { + "name": "Spoken Dialogue", + "metrics": [ + "joint_goal_accuracy", + "slot_accuracy", + "slot_f1", + "bleu" + ], + "configs": [ + "spokenwoz_audio", + "spokenwoz_text" + ] + }, + "spoken_dialogue_summarization": { + "name": "Spoken Dialogue Summarization", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "mnsc_sds_part3_test", + "mnsc_sds_part4_test", + "mnsc_sds_part5_test" + ] + }, + "sqqa": { + "name": "Sqqa", + "metrics": [ + "llm_judge_big_bench_audio" + ], + "configs": [ + "big_bench_audio_audio_query", + "big_bench_audio_text_query", + "mmsu_biology", + "mmsu_business", + "mmsu_chemistry", + "mmsu_economics", + "mmsu_engineering", + "mmsu_health", + "mmsu_history", + "mmsu_law", + "mmsu_other", + "mmsu_philosophy", + "mmsu_physics", + "mmsu_psychology", + "openbookqa", + "sd-qa_aus_audio", + "sd-qa_aus_text", + "sd-qa_gbr_audio", + "sd-qa_gbr_text", + "sd-qa_ind_n_audio", 
+ "sd-qa_ind_n_text", + "sd-qa_ind_s_audio", + "sd-qa_ind_s_text", + "sd-qa_irl_audio", + "sd-qa_irl_text", + "sd-qa_kenya_audio", + "sd-qa_kenya_text", + "sd-qa_nga_audio", + "sd-qa_nga_text", + "sd-qa_nzl_audio", + "sd-qa_nzl_text", + "sd-qa_phl_audio", + "sd-qa_phl_text", + "sd-qa_usa_audio", + "sd-qa_usa_text", + "sd-qa_zaf_audio", + "sd-qa_zaf_text" + ] + }, + "translation": { + "name": "Translation", + "metrics": [ + "bleu", + "meteor", + "bertscore", + "comet" + ], + "configs": [ + "covost2_ar_en", + "covost2_ca_en", + "covost2_cy_en", + "covost2_de_en", + "covost2_en_ar", + "covost2_en_ca", + "covost2_en_cy", + "covost2_en_de", + "covost2_en_et", + "covost2_en_fa", + "covost2_en_id", + "covost2_en_ja", + "covost2_en_lv", + "covost2_en_mn", + "covost2_en_sl", + "covost2_en_sv-SE", + "covost2_en_ta", + "covost2_en_tr", + "covost2_en_zh-CN", + "covost2_es_en", + "covost2_et_en", + "covost2_fa_en", + "covost2_fr_en", + "covost2_id_en", + "covost2_it_en", + "covost2_ja_en", + "covost2_lv_en", + "covost2_mn_en", + "covost2_nl_en", + "covost2_pt_en", + "covost2_ru_en", + "covost2_sl_en", + "covost2_sv-SE_en", + "covost2_ta_en", + "covost2_tr_en", + "covost2_zh-CN_en" + ] + } + } + }, + "music_understanding": { + "name": "Music Understanding", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mu_chomusic_test" + ], + "category": "audio_understanding" + }, + "scene_understanding": { + "name": "Scene Understanding", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "audiocaps_qa_test", + "audiocaps_test", + "clotho_aqa_test", + "wavcaps_qa_test", + "wavcaps_test" + ], + "category": "audio_understanding" + }, + "accent_recognition": { + "name": "Accent Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mnsc_pqa_ar_dialogue_test", + "mnsc_pqa_ar_sentence_test", + "voxceleb_accent_test" + ], + "category": "paralinguistics" + }, + "emotion_recognition": { + "name": "Emotion Recognition", + "metrics": [ + "llm_judge_binary" + ], + 
"configs": [ + "iemocap_emotion_recognition", + "meld_emotion_test", + "meld_sentiment_test" + ], + "category": "paralinguistics" + }, + "gender_recognition": { + "name": "Gender Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "iemocap_gender_recognition", + "mnsc_pqa_gr_dialogue_test", + "mnsc_pqa_gr_sentence_test", + "voxceleb_gender_test" + ], + "category": "paralinguistics" + }, + "speaker_diarization": { + "name": "Speaker Diarization", + "metrics": [ + "diarization_metrics" + ], + "configs": [ + "callhome_diarization_deu", + "callhome_diarization_eng", + "callhome_diarization_jpn", + "callhome_diarization_spa", + "callhome_diarization_zho" + ], + "category": "paralinguistics" + }, + "speaker_recognition": { + "name": "Speaker Recognition", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "mmau_mini" + ], + "category": "paralinguistics" + }, + "phonemes": { + "name": "Phonemes", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "voxangeles_phoneme_counting" + ], + "category": "phonetics" + }, + "safety": { + "name": "Safety", + "metrics": [ + "llm_judge_redteaming" + ], + "configs": [ + "advbench" + ], + "category": "safety_and_security" + }, + "spoofing": { + "name": "Spoofing", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "asvspoof" + ], + "category": "safety_and_security" + }, + "voice_disorder": { + "name": "Voice Disorder", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "stuttering_detection" + ], + "category": "speech_disorder" + }, + "noise_detection": { + "name": "Noise Detection", + "metrics": [ + "llm_judge_binary", + "detailed_judge_prompt" + ], + "configs": [ + "noise_detection" + ], + "category": "speech_enhancement" + }, + "asr": { + "name": "Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "aishell_1_test", + "ami_ihm", + "ami_sdm", + "callhome_asr_deu", + "callhome_asr_eng", + 
"callhome_asr_jpn", + "callhome_asr_spa", + "callhome_asr_zho", + "common_voice_15_ab", + "common_voice_15_af", + "common_voice_15_am", + "common_voice_15_ar", + "common_voice_15_as", + "common_voice_15_ast", + "common_voice_15_az", + "common_voice_15_ba", + "common_voice_15_bas", + "common_voice_15_be", + "common_voice_15_bg", + "common_voice_15_bn", + "common_voice_15_br", + "common_voice_15_ca", + "common_voice_15_ckb", + "common_voice_15_cnh", + "common_voice_15_cs", + "common_voice_15_cv", + "common_voice_15_cy", + "common_voice_15_da", + "common_voice_15_de", + "common_voice_15_dv", + "common_voice_15_dyu", + "common_voice_15_el", + "common_voice_15_en", + "common_voice_15_eo", + "common_voice_15_es", + "common_voice_15_et", + "common_voice_15_eu", + "common_voice_15_fa", + "common_voice_15_fi", + "common_voice_15_fr", + "common_voice_15_fy-NL", + "common_voice_15_ga-IE", + "common_voice_15_gl", + "common_voice_15_ha", + "common_voice_15_hi", + "common_voice_15_hsb", + "common_voice_15_hu", + "common_voice_15_hy-AM", + "common_voice_15_ia", + "common_voice_15_id", + "common_voice_15_ig", + "common_voice_15_it", + "common_voice_15_ja", + "common_voice_15_ka", + "common_voice_15_kab", + "common_voice_15_kk", + "common_voice_15_kmr", + "common_voice_15_ko", + "common_voice_15_ky", + "common_voice_15_lg", + "common_voice_15_lt", + "common_voice_15_lv", + "common_voice_15_mdf", + "common_voice_15_mg", + "common_voice_15_mk", + "common_voice_15_ml", + "common_voice_15_mn", + "common_voice_15_mr", + "common_voice_15_mt", + "common_voice_15_myv", + "common_voice_15_ne-NP", + "common_voice_15_nl", + "common_voice_15_nn-NO", + "common_voice_15_or", + "common_voice_15_pa-IN", + "common_voice_15_pl", + "common_voice_15_pt", + "common_voice_15_rm-sursilv", + "common_voice_15_rm-vallader", + "common_voice_15_ro", + "common_voice_15_ru", + "common_voice_15_rw", + "common_voice_15_sah", + "common_voice_15_sat", + "common_voice_15_sc", + "common_voice_15_sk", + 
"common_voice_15_sl", + "common_voice_15_sr", + "common_voice_15_sv-SE", + "common_voice_15_sw", + "common_voice_15_ta", + "common_voice_15_te", + "common_voice_15_tg", + "common_voice_15_th", + "common_voice_15_ti", + "common_voice_15_tok", + "common_voice_15_tr", + "common_voice_15_tt", + "common_voice_15_ug", + "common_voice_15_uk", + "common_voice_15_ur", + "common_voice_15_uz", + "common_voice_15_vot", + "common_voice_15_yi", + "common_voice_15_yue", + "common_voice_15_zh-CN", + "common_voice_15_zh-HK", + "common_voice_15_zh-TW", + "fleurs_af_za", + "fleurs_am_et", + "fleurs_ar_eg", + "fleurs_as_in", + "fleurs_ast_es", + "fleurs_az_az", + "fleurs_be_by", + "fleurs_bg_bg", + "fleurs_bn_in", + "fleurs_bs_ba", + "fleurs_ca_es", + "fleurs_ceb_ph", + "fleurs_ckb_iq", + "fleurs_cmn_hans_cn", + "fleurs_cs_cz", + "fleurs_cy_gb", + "fleurs_da_dk", + "fleurs_de_de", + "fleurs_el_gr", + "fleurs_en_us", + "fleurs_es_419", + "fleurs_et_ee", + "fleurs_fa_ir", + "fleurs_ff_sn", + "fleurs_fi_fi", + "fleurs_fil_ph", + "fleurs_fr_fr", + "fleurs_ga_ie", + "fleurs_gl_es", + "fleurs_gu_in", + "fleurs_ha_ng", + "fleurs_he_il", + "fleurs_hi_in", + "fleurs_hr_hr", + "fleurs_hu_hu", + "fleurs_hy_am", + "fleurs_id_id", + "fleurs_ig_ng", + "fleurs_is_is", + "fleurs_it_it", + "fleurs_ja_jp", + "fleurs_jv_id", + "fleurs_ka_ge", + "fleurs_kam_ke", + "fleurs_kea_cv", + "fleurs_kk_kz", + "fleurs_km_kh", + "fleurs_kn_in", + "fleurs_ko_kr", + "fleurs_ky_kg", + "fleurs_lb_lu", + "fleurs_lg_ug", + "fleurs_ln_cd", + "fleurs_lo_la", + "fleurs_lt_lt", + "fleurs_luo_ke", + "fleurs_lv_lv", + "fleurs_mi_nz", + "fleurs_mk_mk", + "fleurs_ml_in", + "fleurs_mn_mn", + "fleurs_mr_in", + "fleurs_ms_my", + "fleurs_mt_mt", + "fleurs_my_mm", + "fleurs_nb_no", + "fleurs_ne_np", + "fleurs_nl_nl", + "fleurs_nso_za", + "fleurs_ny_mw", + "fleurs_oc_fr", + "fleurs_om_et", + "fleurs_or_in", + "fleurs_pa_in", + "fleurs_pl_pl", + "fleurs_ps_af", + "fleurs_pt_br", + "fleurs_ro_ro", + "fleurs_ru_ru", + "fleurs_sd_in", + 
"fleurs_sk_sk", + "fleurs_sl_si", + "fleurs_sn_zw", + "fleurs_so_so", + "fleurs_sr_rs", + "fleurs_sv_se", + "fleurs_sw_ke", + "fleurs_ta_in", + "fleurs_te_in", + "fleurs_tg_tj", + "fleurs_th_th", + "fleurs_tr_tr", + "fleurs_uk_ua", + "fleurs_umb_ao", + "fleurs_ur_pk", + "fleurs_uz_uz", + "fleurs_vi_vn", + "fleurs_wo_sn", + "fleurs_xh_za", + "fleurs_yo_ng", + "fleurs_yue_hant_hk", + "fleurs_zu_za", + "gigaspeech_test", + "gigaspeech2_id_test", + "gigaspeech2_th_test", + "gigaspeech2_vi_test", + "librispeech_test_clean", + "librispeech_test_other", + "librispeech_multilingual_dutch", + "librispeech_multilingual_french", + "librispeech_multilingual_german", + "librispeech_multilingual_italian", + "librispeech_multilingual_polish", + "librispeech_multilingual_portuguese", + "librispeech_multilingual_spanish", + "mnsc_asr_part1_test", + "mnsc_asr_part2_test", + "mnsc_asr_part3_test", + "mnsc_asr_part4_test", + "mnsc_asr_part5_test", + "mnsc_asr_part6_test", + "peoples_speech_test", + "spgispeech_test", + "tedlium3_test", + "voxpopuli_cs", + "voxpopuli_de", + "voxpopuli_en", + "voxpopuli_en_accented", + "voxpopuli_es", + "voxpopuli_et", + "voxpopuli_fi", + "voxpopuli_fr", + "voxpopuli_hr", + "voxpopuli_hu", + "voxpopuli_it", + "voxpopuli_lt", + "voxpopuli_nl", + "voxpopuli_pl", + "voxpopuli_ro", + "voxpopuli_sk", + "voxpopuli_sl" + ], + "category": "speech_recognition" + }, + "code_switching_asr": { + "name": "Code Switching Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "seame_dev_man", + "seame_dev_sge" + ], + "category": "speech_recognition" + }, + "long_form_asr": { + "name": "Long Form Asr", + "metrics": [ + "word_error_rate" + ], + "configs": [ + "earnings21", + "earnings22", + "tedlium3_long_form" + ], + "category": "speech_recognition" + }, + "bfcl": { + "name": "Bfcl", + "metrics": [ + "bfcl_match_score" + ], + "configs": [ + "bfcl_audio_irrelevance", + "bfcl_audio_multiple", + "bfcl_audio_parallel", + "bfcl_audio_parallel_multiple", + 
"bfcl_audio_simple", + "bfcl_audio_irrelevance_no_prompt", + "bfcl_audio_multiple_no_prompt", + "bfcl_audio_parallel_multiple_no_prompt", + "bfcl_audio_parallel_no_prompt", + "bfcl_audio_simple_no_prompt", + "bfcl_text_irrelevance", + "bfcl_text_multiple", + "bfcl_text_parallel", + "bfcl_text_parallel_multiple", + "bfcl_text_simple", + "bfcl_text_irrelevance_no_prompt", + "bfcl_text_multiple_no_prompt", + "bfcl_text_parallel_multiple_no_prompt", + "bfcl_text_parallel_no_prompt", + "bfcl_text_simple_no_prompt" + ], + "category": "spoken_language_reasoning" + }, + "gsm8k": { + "name": "Gsm8K", + "metrics": [ + "gsm8k_exact_match" + ], + "configs": [ + "gsm8k_audio", + "gsm8k_text" + ], + "category": "spoken_language_reasoning" + }, + "ifeval": { + "name": "Ifeval", + "metrics": [ + "instruction_following" + ], + "configs": [ + "voicebench_ifeval_audio", + "voicebench_ifeval_text" + ], + "category": "spoken_language_reasoning" + }, + "mtbench": { + "name": "Mtbench", + "metrics": [ + "mt_bench_llm_judge" + ], + "configs": [ + "mtbench_audio", + "mtbench_text" + ], + "category": "spoken_language_reasoning" + }, + "speech_to_sql": { + "name": "Speech To Sql", + "metrics": [ + "sql_score" + ], + "configs": [ + "spider_audio", + "spider_text" + ], + "category": "spoken_language_reasoning" + }, + "intent_classification": { + "name": "Intent Classification", + "metrics": [ + "llm_judge_binary" + ], + "configs": [ + "SLURP-intent" + ], + "category": "spoken_language_understanding" + }, + "speech_qa": { + "name": "Speech Qa", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "alpaca_audio_test", + "cn_college_listen_mcq_test", + "dream_tts_mcq_test", + "mnsc_sqa_part3_test", + "mnsc_sqa_part4_test", + "mnsc_sqa_part5_test", + "mnsc_sqa_part6_test", + "openhermes_instruction_test", + "public_sg_speech_qa_test", + "slue_p2_sqa5_test", + "spoken_squad_test" + ], + "category": "spoken_language_understanding" + }, + "spoken_dialogue": { + "name": "Spoken Dialogue", + 
"metrics": [ + "joint_goal_accuracy", + "slot_accuracy", + "slot_f1", + "bleu" + ], + "configs": [ + "spokenwoz_audio", + "spokenwoz_text" + ], + "category": "spoken_language_understanding" + }, + "spoken_dialogue_summarization": { + "name": "Spoken Dialogue Summarization", + "metrics": [ + "llm_judge_detailed" + ], + "configs": [ + "mnsc_sds_part3_test", + "mnsc_sds_part4_test", + "mnsc_sds_part5_test" + ], + "category": "spoken_language_understanding" + }, + "sqqa": { + "name": "Sqqa", + "metrics": [ + "llm_judge_big_bench_audio" + ], + "configs": [ + "big_bench_audio_audio_query", + "big_bench_audio_text_query", + "mmsu_biology", + "mmsu_business", + "mmsu_chemistry", + "mmsu_economics", + "mmsu_engineering", + "mmsu_health", + "mmsu_history", + "mmsu_law", + "mmsu_other", + "mmsu_philosophy", + "mmsu_physics", + "mmsu_psychology", + "openbookqa", + "sd-qa_aus_audio", + "sd-qa_aus_text", + "sd-qa_gbr_audio", + "sd-qa_gbr_text", + "sd-qa_ind_n_audio", + "sd-qa_ind_n_text", + "sd-qa_ind_s_audio", + "sd-qa_ind_s_text", + "sd-qa_irl_audio", + "sd-qa_irl_text", + "sd-qa_kenya_audio", + "sd-qa_kenya_text", + "sd-qa_nga_audio", + "sd-qa_nga_text", + "sd-qa_nzl_audio", + "sd-qa_nzl_text", + "sd-qa_phl_audio", + "sd-qa_phl_text", + "sd-qa_usa_audio", + "sd-qa_usa_text", + "sd-qa_zaf_audio", + "sd-qa_zaf_text" + ], + "category": "spoken_language_understanding" + }, + "translation": { + "name": "Translation", + "metrics": [ + "bleu", + "meteor", + "bertscore", + "comet" + ], + "configs": [ + "covost2_ar_en", + "covost2_ca_en", + "covost2_cy_en", + "covost2_de_en", + "covost2_en_ar", + "covost2_en_ca", + "covost2_en_cy", + "covost2_en_de", + "covost2_en_et", + "covost2_en_fa", + "covost2_en_id", + "covost2_en_ja", + "covost2_en_lv", + "covost2_en_mn", + "covost2_en_sl", + "covost2_en_sv-SE", + "covost2_en_ta", + "covost2_en_tr", + "covost2_en_zh-CN", + "covost2_es_en", + "covost2_et_en", + "covost2_fa_en", + "covost2_fr_en", + "covost2_id_en", + "covost2_it_en", + 
"covost2_ja_en", + "covost2_lv_en", + "covost2_mn_en", + "covost2_nl_en", + "covost2_pt_en", + "covost2_ru_en", + "covost2_sl_en", + "covost2_sv-SE_en", + "covost2_ta_en", + "covost2_tr_en", + "covost2_zh-CN_en" + ], + "category": "spoken_language_understanding" + } +} \ No newline at end of file