Commit

Add config (#19)
* [WIP] add UI config for finetuning in notebook

* Add hf reference

* Add config file using pydantic and yaml

* Lint

* Fix yaml issues

* Convert TrainingConfig to dict

* Fix EOF for yaml

* Fix attribute name

* Update import paths

* Use os.cpu_count() instead of config arg
Kostis-S-Z authored Jan 30, 2025
1 parent ea5a1ae commit 77cff00
Showing 6 changed files with 128 additions and 98 deletions.
31 changes: 11 additions & 20 deletions demo/notebook.ipynb
@@ -113,14 +113,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcebc6a2-b74c-47b4-bd94-b945bef9177f",
"metadata": {},
"cell_type": "code",
"outputs": [],
"source": [
"from src.speech_to_text_finetune.finetune_whisper import run_finetuning"
]
"execution_count": null,
"source": "from speech_to_text_finetune.finetune_whisper import run_finetuning",
"id": "da07095b78eba3c0"
},
{
"cell_type": "markdown",
@@ -139,7 +137,9 @@
"metadata": {},
"outputs": [],
"source": [
"model_id = \"openai/whisper-tiny\"\n",
"# @title Finetuning configuration and hyperparameter setting\n",
"\n",
"model_id = \"openai/whisper-tiny\" # @ [\"openai/whisper-tiny\", \"openai/whisper-small\", \"openai/whisper-medium\"]\n",
"dataset_id = \"mozilla-foundation/common_voice_17_0\"\n",
"language = \"Greek\"\n",
"\n",
@@ -161,21 +161,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e885747a52fc2b3",
"metadata": {},
"cell_type": "code",
"outputs": [],
"source": [
"run_finetuning(\n",
" model_id=model_id,\n",
" dataset_id=dataset_id,\n",
" language=language,\n",
" repo_name=repo_name,\n",
" max_steps=test_max_steps,\n",
" private_hf_repo=make_repo_private,\n",
")"
]
"execution_count": null,
"source": "run_finetuning(config_path=\"src/speech_to_text_finetune/config.yaml\")",
"id": "73ef3bd5de291da3"
}
],
"metadata": {
53 changes: 53 additions & 0 deletions src/speech_to_text_finetune/config.py
@@ -0,0 +1,53 @@
import yaml
from pydantic import BaseModel


def load_config(config_path: str):
with open(config_path, "r") as file:
config_dict = yaml.safe_load(file)

return Config(**config_dict)


class TrainingConfig(BaseModel):
"""
More info at https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
"""

push_to_hub: bool
hub_private_repo: bool
max_steps: int
per_device_train_batch_size: int
gradient_accumulation_steps: int
learning_rate: float
warmup_steps: int
gradient_checkpointing: bool
fp16: bool
eval_strategy: str
per_device_eval_batch_size: int
predict_with_generate: bool
generation_max_length: int
save_steps: int
logging_steps: int
load_best_model_at_end: bool
metric_for_best_model: str
greater_is_better: bool


class Config(BaseModel):
"""
Store configuration used for finetuning
Args:
model_id (str): HF model id of a Whisper model used for finetuning
dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo
language (str): registered language string that is supported by the Common Voice dataset
repo_name (str | None): used both for the local dir and the HF repo; if None, a name is created from the model and language id
training_hp (TrainingConfig): stores selected hyperparameter values for Seq2SeqTrainingArguments
"""

model_id: str
dataset_id: str
language: str
repo_name: str | None
training_hp: TrainingConfig
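
For orientation, a minimal usage sketch of the new loader (illustrative, not part of the commit; it assumes the package is importable and uses only the field names shown above):

from speech_to_text_finetune.config import load_config

cfg = load_config("src/speech_to_text_finetune/config.yaml")
print(cfg.model_id)                   # "openai/whisper-tiny" with the shipped config.yaml
print(cfg.training_hp.learning_rate)  # pydantic coerces the YAML value to a float
training_kwargs = cfg.training_hp.dict()  # nested block as a plain dict, as finetune_whisper.py unpacks it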
24 changes: 24 additions & 0 deletions src/speech_to_text_finetune/config.yaml
@@ -0,0 +1,24 @@
model_id: openai/whisper-tiny
dataset_id: mozilla-foundation/common_voice_17_0
language: Greek
repo_name: null  # YAML null becomes Python None, so a default repo name is generated

training_hp:
push_to_hub: False
hub_private_repo: True
max_steps: 1
per_device_train_batch_size: 64
gradient_accumulation_steps: 1
learning_rate: 1e-5
warmup_steps: 50
gradient_checkpointing: True
fp16: True
eval_strategy: steps
per_device_eval_batch_size: 8
predict_with_generate: True
generation_max_length: 225
save_steps: 250
logging_steps: 25
load_best_model_at_end: True
metric_for_best_model: wer
greater_is_better: False
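
A note on how PyYAML's safe_load (used by load_config above) reads these values: True/False become booleans, 1e-5 without a decimal point is loaded as the string "1e-5" and only turned into a float by pydantic's TrainingConfig field types, and a null repo_name has to be written as null (or left empty), since the bare word None would be loaded as the string "None". A quick illustrative check, assuming PyYAML:

import yaml

print(yaml.safe_load("learning_rate: 1e-5"))  # {'learning_rate': '1e-5'}; pydantic coerces it to 1e-05
print(yaml.safe_load("repo_name: null"))      # {'repo_name': None}
print(yaml.safe_load("repo_name: None"))      # {'repo_name': 'None'}, a non-empty string, not None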
4 changes: 3 additions & 1 deletion src/speech_to_text_finetune/data_process.py
@@ -1,3 +1,5 @@
import os

import torch
from dataclasses import dataclass
from typing import Dict, List, Union
@@ -60,7 +62,7 @@ def process_dataset(
_process_inputs_and_labels_for_whisper,
fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
remove_columns=dataset.column_names["train"],
num_proc=2,
num_proc=os.cpu_count(),
)
return dataset
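
One caveat with this change, not addressed in the commit: os.cpu_count() can return None when the CPU count is undetermined, and datasets.map() then falls back to single-process mapping. A defensive variant (hypothetical, not in this commit) would be:

num_proc=os.cpu_count() or 1,  # guard so at least one worker is used if the CPU count is unknown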

110 changes: 35 additions & 75 deletions src/speech_to_text_finetune/finetune_whisper.py
@@ -14,81 +14,66 @@
import evaluate
from evaluate import EvaluationModule
from loguru import logger
from src.speech_to_text_finetune.data_process import (

from speech_to_text_finetune.config import load_config
from speech_to_text_finetune.data_process import (
load_common_voice,
DataCollatorSpeechSeq2SeqWithPadding,
process_dataset,
)
from src.speech_to_text_finetune.hf_utils import (
from speech_to_text_finetune.hf_utils import (
get_hf_username,
upload_custom_hf_model_card,
get_available_languages_in_cv,
)

hf_username = get_hf_username()
dataset_id_cv = "mozilla-foundation/common_voice_17_0"
model_id_whisper = "openai/whisper-tiny"
test_language = "Greek"

test_repo_name = "testing" # None for default name, or set your own
test_max_steps = 100
push_to_hf = True
make_repo_private = False


def run_finetuning(
model_id: str,
dataset_id: str,
language: str,
repo_name: str | None,
max_steps: int = 2000,
private_hf_repo: bool = True,
) -> Tuple[Dict, Dict]:

def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:
"""
Complete pipeline for preprocessing the Common Voice dataset and then finetuning a Whisper model on it.
Args:
model_id (str): HF model id of a Whisper model used for finetuning
dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo
language (str): registered language string that is supported by the Common Voice dataset
repo_name (str): repo ID that will be used for storing artifacts both locally and on HF
max_steps (int): number of steps to run the training job, defaults to 2000
private_hf_repo (bool): flag whether to make the HF public (False) or private (True)
config_path (str): The filepath to a yaml file that follows the format defined in config.py
Returns:
Tuple[Dict, Dict]: evaluation metrics from the baseline and the finetuned models
"""
cfg = load_config(config_path)

languages_name_to_id = get_available_languages_in_cv(dataset_id)
language_id = languages_name_to_id[language]
hf_username = get_hf_username()

if not repo_name:
repo_name = f"{model_id.split('/')[1]}-{language_id}"
hf_repo_name = f"{hf_username}/{repo_name}"
local_output_dir = f"./artifacts/{repo_name}"
languages_name_to_id = get_available_languages_in_cv(cfg.dataset_id)
language_id = languages_name_to_id[cfg.language]

if not cfg.repo_name:
cfg.repo_name = f"{cfg.model_id.split('/')[1]}-{language_id}"
hf_repo_name = f"{hf_username}/{cfg.repo_name}"
local_output_dir = f"./artifacts/{cfg.repo_name}"

logger.info(
f"Finetuning job will soon start. "
f"Results will be saved local at {local_output_dir} uploaded in HF at {hf_repo_name}. "
f"Private repo is set to {private_hf_repo}."
f"Private repo is set to {cfg.training_hp.hub_private_repo}."
)

logger.info(f"Loading the {language} subset from the {dataset_id} dataset.")
dataset = load_common_voice(dataset_id, language_id)
logger.info(f"Loading the {cfg.language} subset from the {cfg.dataset_id} dataset.")
dataset = load_common_voice(cfg.dataset_id, language_id)

device = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"

logger.info(f"Loading {model_id} on {device} and configuring it for {language}.")
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
logger.info(
f"Loading {cfg.model_id} on {device} and configuring it for {cfg.language}."
)
feature_extractor = WhisperFeatureExtractor.from_pretrained(cfg.model_id)
tokenizer = WhisperTokenizer.from_pretrained(
model_id, language=language, task="transcribe"
cfg.model_id, language=cfg.language, task="transcribe"
)
processor = WhisperProcessor.from_pretrained(
model_id, language=language, task="transcribe"
cfg.model_id, language=cfg.language, task="transcribe"
)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(cfg.model_id)

model.generation_config.language = language.lower()
model.generation_config.language = cfg.language.lower()
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

@@ -102,27 +87,9 @@ def run_finetuning(

training_args = Seq2SeqTrainingArguments(
output_dir=local_output_dir,
per_device_train_batch_size=64,
gradient_accumulation_steps=1,
learning_rate=1e-5,
warmup_steps=50,
max_steps=max_steps,
gradient_checkpointing=True,
fp16=True,
eval_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,
generation_max_length=225,
save_steps=250,
eval_steps=250,
logging_steps=25,
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,
report_to=["tensorboard"],
push_to_hub=push_to_hf,
hub_model_id=hf_repo_name,
hub_private_repo=private_hf_repo,
report_to=["tensorboard"],
**cfg.training_hp.dict(),
)

metric = evaluate.load("wer")
@@ -142,7 +109,7 @@ def run_finetuning(
processor.save_pretrained(training_args.output_dir)

logger.info(
f"Before finetuning, run evaluation on the baseline model {model_id} to easily compare performance"
f"Before finetuning, run evaluation on the baseline model {cfg.model_id} to easily compare performance"
f" before and after finetuning"
)
baseline_eval_results = trainer.evaluate()
@@ -159,15 +126,15 @@ def run_finetuning(
eval_results = trainer.evaluate()
logger.info(f"Evaluation complete. Results:\n\t {eval_results}")

if push_to_hf:
if cfg.training_hp.push_to_hub:
logger.info(f"Uploading model and eval results to HuggingFace: {hf_repo_name}")
trainer.push_to_hub()
upload_custom_hf_model_card(
hf_repo_name=hf_repo_name,
model_id=model_id,
dataset_id=dataset_id,
model_id=cfg.model_id,
dataset_id=cfg.dataset_id,
language_id=language_id,
language=language,
language=cfg.language,
n_train_samples=dataset["train"].num_rows,
n_eval_samples=dataset["test"].num_rows,
baseline_eval_results=baseline_eval_results,
@@ -216,11 +183,4 @@ def compute_word_error_rate(


if __name__ == "__main__":
run_finetuning(
model_id=model_id_whisper,
dataset_id=dataset_id_cv,
language=test_language,
repo_name=test_repo_name,
max_steps=test_max_steps,
private_hf_repo=make_repo_private,
)
run_finetuning(config_path="src/speech_to_text_finetune/config.yaml")
4 changes: 2 additions & 2 deletions src/speech_to_text_finetune/hf_utils.py
@@ -72,8 +72,8 @@ def upload_custom_hf_model_card(
ft_eval_results: Dict,
) -> None:
"""
Create and upload a custom Model Card (<TODO: hf reference here>) to the Hugging Face repo of the finetuned model
that highlights the evaluation results before and after finetuning.
Create and upload a custom Model Card (https://huggingface.co/docs/hub/model-cards) to the Hugging Face repo
of the finetuned model that highlights the evaluation results before and after finetuning.
"""

card_metadata = ModelCardData(
