From 77cff00ca5ba096e7aa735e5c87896ec4345abd3 Mon Sep 17 00:00:00 2001 From: Kostis Date: Thu, 30 Jan 2025 21:06:22 +0200 Subject: [PATCH] Add config (#19) * [WIP] add UI config for finetuning in notebook * Add hf reference * Add config file using pydantic and yaml * Lint * Fix yaml issues * Convert TrainingConfig to dict * Fix EOF for yaml * Fix attribute name * Update import paths * Use os.cpu_count() instead of config arg --- demo/notebook.ipynb | 31 ++--- src/speech_to_text_finetune/config.py | 53 +++++++++ src/speech_to_text_finetune/config.yaml | 24 ++++ src/speech_to_text_finetune/data_process.py | 4 +- .../finetune_whisper.py | 110 ++++++------------ src/speech_to_text_finetune/hf_utils.py | 4 +- 6 files changed, 128 insertions(+), 98 deletions(-) create mode 100644 src/speech_to_text_finetune/config.py create mode 100644 src/speech_to_text_finetune/config.yaml diff --git a/demo/notebook.ipynb b/demo/notebook.ipynb index ff8f467..922267d 100644 --- a/demo/notebook.ipynb +++ b/demo/notebook.ipynb @@ -113,14 +113,12 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "fcebc6a2-b74c-47b4-bd94-b945bef9177f", "metadata": {}, + "cell_type": "code", "outputs": [], - "source": [ - "from src.speech_to_text_finetune.finetune_whisper import run_finetuning" - ] + "execution_count": null, + "source": "from speech_to_text_finetune.finetune_whisper import run_finetuning", + "id": "da07095b78eba3c0" }, { "cell_type": "markdown", @@ -139,7 +137,9 @@ "metadata": {}, "outputs": [], "source": [ - "model_id = \"openai/whisper-tiny\"\n", + "# @title Finetuning configuration and hyperparameter setting\n", + "\n", + "model_id = \"openai/whisper-tiny\" # @ [\"openai/whisper-tiny\", \"openai/whisper-small\", \"openai/whisper-medium\"]\n", "dataset_id = \"mozilla-foundation/common_voice_17_0\"\n", "language = \"Greek\"\n", "\n", @@ -161,21 +161,12 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "3e885747a52fc2b3", "metadata": {}, + "cell_type": 
"code", "outputs": [], - "source": [ - "run_finetuning(\n", - " model_id=model_id,\n", - " dataset_id=dataset_id,\n", - " language=language,\n", - " repo_name=repo_name,\n", - " max_steps=test_max_steps,\n", - " private_hf_repo=make_repo_private,\n", - ")" - ] + "execution_count": null, + "source": "run_finetuning(config_path=\"src/speech_to_text_finetune/config.yaml\")", + "id": "73ef3bd5de291da3" } ], "metadata": { diff --git a/src/speech_to_text_finetune/config.py b/src/speech_to_text_finetune/config.py new file mode 100644 index 0000000..056eb95 --- /dev/null +++ b/src/speech_to_text_finetune/config.py @@ -0,0 +1,53 @@ +import yaml +from pydantic import BaseModel + + +def load_config(config_path: str): + with open(config_path, "r") as file: + config_dict = yaml.safe_load(file) + + return Config(**config_dict) + + +class TrainingConfig(BaseModel): + """ + More info at https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments + """ + + push_to_hub: bool + hub_private_repo: bool + max_steps: int + per_device_train_batch_size: int + gradient_accumulation_steps: int + learning_rate: float + warmup_steps: int + gradient_checkpointing: bool + fp16: bool + eval_strategy: str + per_device_eval_batch_size: int + predict_with_generate: bool + generation_max_length: int + save_steps: int + logging_steps: int + load_best_model_at_end: bool + metric_for_best_model: str + greater_is_better: bool + + +class Config(BaseModel): + """ + Store configuration used for finetuning + + Args: + model_id (str): HF model id of a Whisper model used for finetuning + dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo + language (str): registered language string that is supported by the Common Voice dataset + repo_name (str | None): used both for local dir and HF, None will create a name based on the model and language id + training_hp (TrainingConfig): store selective hyperparameter values 
from Seq2SeqTrainingArguments + """ + + model_id: str + dataset_id: str + language: str + repo_name: str | None + training_hp: TrainingConfig diff --git a/src/speech_to_text_finetune/config.yaml b/src/speech_to_text_finetune/config.yaml new file mode 100644 index 0000000..c9af007 --- /dev/null +++ b/src/speech_to_text_finetune/config.yaml @@ -0,0 +1,24 @@ +model_id: openai/whisper-tiny +dataset_id: mozilla-foundation/common_voice_17_0 +language: Greek +repo_name: null + +training_hp: + push_to_hub: False + hub_private_repo: True + max_steps: 1 + per_device_train_batch_size: 64 + gradient_accumulation_steps: 1 + learning_rate: 1e-5 + warmup_steps: 50 + gradient_checkpointing: True + fp16: True + eval_strategy: steps + per_device_eval_batch_size: 8 + predict_with_generate: True + generation_max_length: 225 + save_steps: 250 + logging_steps: 25 + load_best_model_at_end: True + metric_for_best_model: wer + greater_is_better: False diff --git a/src/speech_to_text_finetune/data_process.py b/src/speech_to_text_finetune/data_process.py index 73087eb..9edb589 100644 --- a/src/speech_to_text_finetune/data_process.py +++ b/src/speech_to_text_finetune/data_process.py @@ -1,3 +1,5 @@ +import os + import torch from dataclasses import dataclass from typing import Dict, List, Union @@ -60,7 +62,7 @@ def process_dataset( _process_inputs_and_labels_for_whisper, fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer}, remove_columns=dataset.column_names["train"], - num_proc=2, + num_proc=os.cpu_count(), ) return dataset diff --git a/src/speech_to_text_finetune/finetune_whisper.py b/src/speech_to_text_finetune/finetune_whisper.py index 65e92c2..563d07e 100644 --- a/src/speech_to_text_finetune/finetune_whisper.py +++ b/src/speech_to_text_finetune/finetune_whisper.py @@ -14,81 +14,66 @@ import evaluate from evaluate import EvaluationModule from loguru import logger -from src.speech_to_text_finetune.data_process import ( + +from speech_to_text_finetune.config import 
load_config +from speech_to_text_finetune.data_process import ( load_common_voice, DataCollatorSpeechSeq2SeqWithPadding, process_dataset, ) -from src.speech_to_text_finetune.hf_utils import ( +from speech_to_text_finetune.hf_utils import ( get_hf_username, upload_custom_hf_model_card, get_available_languages_in_cv, ) -hf_username = get_hf_username() -dataset_id_cv = "mozilla-foundation/common_voice_17_0" -model_id_whisper = "openai/whisper-tiny" -test_language = "Greek" - -test_repo_name = "testing" # None for default name, or set your own -test_max_steps = 100 -push_to_hf = True -make_repo_private = False - - -def run_finetuning( - model_id: str, - dataset_id: str, - language: str, - repo_name: str | None, - max_steps: int = 2000, - private_hf_repo: bool = True, -) -> Tuple[Dict, Dict]: + +def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]: """ Complete pipeline for preprocessing the Common Voice dataset and then finetuning a Whisper model on it. Args: - model_id (str): HF model id of a Whisper model used for finetuning - dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo - language (str): registered language string that is supported by the Common Voice dataset - repo_name (str): repo ID that will be used for storing artifacts both locally and on HF - max_steps (int): number of steps to run the training job, defaults to 2000 - private_hf_repo (bool): flag whether to make the HF public (False) or private (True) + config_path (str): The filepath to a yaml file that follows the format defined in config.py Returns: Tuple[Dict, Dict]: evaluation metrics from the baseline and the finetuned models """ + cfg = load_config(config_path) - languages_name_to_id = get_available_languages_in_cv(dataset_id) - language_id = languages_name_to_id[language] + hf_username = get_hf_username() - if not repo_name: - repo_name = f"{model_id.split('/')[1]}-{language_id}" - hf_repo_name = 
f"{hf_username}/{repo_name}" - local_output_dir = f"./artifacts/{repo_name}" + languages_name_to_id = get_available_languages_in_cv(cfg.dataset_id) + language_id = languages_name_to_id[cfg.language] + + if not cfg.repo_name: + cfg.repo_name = f"{cfg.model_id.split('/')[1]}-{language_id}" + hf_repo_name = f"{hf_username}/{cfg.repo_name}" + local_output_dir = f"./artifacts/{cfg.repo_name}" logger.info( f"Finetuning job will soon start. " f"Results will be saved local at {local_output_dir} uploaded in HF at {hf_repo_name}. " - f"Private repo is set to {private_hf_repo}." + f"Private repo is set to {cfg.training_hp.hub_private_repo}." ) - logger.info(f"Loading the {language} subset from the {dataset_id} dataset.") - dataset = load_common_voice(dataset_id, language_id) + logger.info(f"Loading the {cfg.language} subset from the {cfg.dataset_id} dataset.") + dataset = load_common_voice(cfg.dataset_id, language_id) device = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" - logger.info(f"Loading {model_id} on {device} and configuring it for {language}.") - feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id) + logger.info( + f"Loading {cfg.model_id} on {device} and configuring it for {cfg.language}." 
+ ) + feature_extractor = WhisperFeatureExtractor.from_pretrained(cfg.model_id) tokenizer = WhisperTokenizer.from_pretrained( - model_id, language=language, task="transcribe" + cfg.model_id, language=cfg.language, task="transcribe" ) processor = WhisperProcessor.from_pretrained( - model_id, language=language, task="transcribe" + cfg.model_id, language=cfg.language, task="transcribe" ) - model = WhisperForConditionalGeneration.from_pretrained(model_id) + model = WhisperForConditionalGeneration.from_pretrained(cfg.model_id) - model.generation_config.language = language.lower() + model.generation_config.language = cfg.language.lower() model.generation_config.task = "transcribe" model.generation_config.forced_decoder_ids = None @@ -102,27 +87,9 @@ def run_finetuning( training_args = Seq2SeqTrainingArguments( output_dir=local_output_dir, - per_device_train_batch_size=64, - gradient_accumulation_steps=1, - learning_rate=1e-5, - warmup_steps=50, - max_steps=max_steps, - gradient_checkpointing=True, - fp16=True, - eval_strategy="steps", - per_device_eval_batch_size=8, - predict_with_generate=True, - generation_max_length=225, - save_steps=250, - eval_steps=250, - logging_steps=25, - load_best_model_at_end=True, - metric_for_best_model="wer", - greater_is_better=False, - report_to=["tensorboard"], - push_to_hub=push_to_hf, hub_model_id=hf_repo_name, - hub_private_repo=private_hf_repo, + report_to=["tensorboard"], + **cfg.training_hp.dict(), ) metric = evaluate.load("wer") @@ -142,7 +109,7 @@ def run_finetuning( processor.save_pretrained(training_args.output_dir) logger.info( - f"Before finetuning, run evaluation on the baseline model {model_id} to easily compare performance" + f"Before finetuning, run evaluation on the baseline model {cfg.model_id} to easily compare performance" f" before and after finetuning" ) baseline_eval_results = trainer.evaluate() @@ -159,15 +126,15 @@ def run_finetuning( eval_results = trainer.evaluate() logger.info(f"Evaluation complete. 
Results:\n\t {eval_results}") - if push_to_hf: + if cfg.training_hp.push_to_hub: logger.info(f"Uploading model and eval results to HuggingFace: {hf_repo_name}") trainer.push_to_hub() upload_custom_hf_model_card( hf_repo_name=hf_repo_name, - model_id=model_id, - dataset_id=dataset_id, + model_id=cfg.model_id, + dataset_id=cfg.dataset_id, language_id=language_id, - language=language, + language=cfg.language, n_train_samples=dataset["train"].num_rows, n_eval_samples=dataset["test"].num_rows, baseline_eval_results=baseline_eval_results, @@ -216,11 +183,4 @@ def compute_word_error_rate( if __name__ == "__main__": - run_finetuning( - model_id=model_id_whisper, - dataset_id=dataset_id_cv, - language=test_language, - repo_name=test_repo_name, - max_steps=test_max_steps, - private_hf_repo=make_repo_private, - ) + run_finetuning(config_path="src/speech_to_text_finetune/config.yaml") diff --git a/src/speech_to_text_finetune/hf_utils.py b/src/speech_to_text_finetune/hf_utils.py index f6a53f7..e2879b7 100644 --- a/src/speech_to_text_finetune/hf_utils.py +++ b/src/speech_to_text_finetune/hf_utils.py @@ -72,8 +72,8 @@ def upload_custom_hf_model_card( ft_eval_results: Dict, ) -> None: """ - Create and upload a custom Model Card () to the Hugging Face repo of the finetuned model - that highlights the evaluation results before and after finetuning. + Create and upload a custom Model Card (https://huggingface.co/docs/hub/model-cards) to the Hugging Face repo + of the finetuned model that highlights the evaluation results before and after finetuning. """ card_metadata = ModelCardData(