From fed0383c42db7a11fcbc44a65aec1a4e214c89e1 Mon Sep 17 00:00:00 2001
From: skyil7
Date: Mon, 24 Mar 2025 17:32:07 +0900
Subject: [PATCH] Feat: Add normalize option to CER and WER metrics for
 normalized score calculation

---
 metrics/cer/README.md | 15 ++++++++++++---
 metrics/cer/cer.py    |  8 ++++++--
 metrics/wer/README.md | 28 +++++++++++++++++++++++++++-
 metrics/wer/wer.py    |  8 ++++++--
 4 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/metrics/cer/README.md b/metrics/cer/README.md
index 1ea4109cc..b96917c75 100644
--- a/metrics/cer/README.md
+++ b/metrics/cer/README.md
@@ -56,12 +56,15 @@ where

 ## How to use

-The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
+The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score). You can also set `normalize=True` to obtain a normalized CER value.

 ```python
 from evaluate import load
 cer = load("cer")
+# Standard CER calculation
 cer_score = cer.compute(predictions=predictions, references=references)
+# Normalized CER calculation
+normalized_cer_score = cer.compute(predictions=predictions, references=references, normalize=True)
 ```

 ## Output values
@@ -74,7 +77,9 @@ print(cer_score)

 The **lower** the CER value, the **better** the performance of the ASR system, with a CER of 0 being a perfect score.

-However, CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions (see [Examples](#Examples) below).
+When using the default settings, CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions (see [Examples](#Examples) below).
+
+When using `normalize=True`, the CER is calculated as `(S + D + I) / (S + D + I + C)`, which ensures the output always falls within the range of 0-1 (or 0-100%).

 ### Values from popular papers

@@ -130,13 +135,17 @@ references = ["hello"]
 cer_score = cer.compute(predictions=predictions, references=references)
 print(cer_score)
 1.2
+# With normalization
+normalized_cer_score = cer.compute(predictions=predictions, references=references, normalize=True)
+print(normalized_cer_score)
+0.5454545454545454 # Will always be between 0 and 1
 ```

 ## Limitations and bias

 CER is useful for comparing different models for tasks such as automatic speech recognition (ASR) and optical character recognition (OCR), especially for multilingual datasets where WER is not suitable given the diversity of languages. However, CER provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.

-Also, in some cases, instead of reporting the raw CER, a normalized CER is reported where the number of mistakes is divided by the sum of the number of edit operations (`I` + `S` + `D`) and `C` (the number of correct characters), which results in CER values that fall within the range of 0–100%.
+The raw CER can exceed 1.0 when there are many insertion errors. To address this, you can use the `normalize=True` parameter to calculate a normalized CER where the number of errors is divided by the sum of the number of edit operations (`I` + `S` + `D`) and `C` (the number of correct characters), which results in CER values that fall within the range of 0–1 (or 0–100%).

 ## Citation

diff --git a/metrics/cer/cer.py b/metrics/cer/cer.py
index c5f4a9072..6ddea41aa 100644
--- a/metrics/cer/cer.py
+++ b/metrics/cer/cer.py
@@ -101,6 +101,7 @@ def process_list(self, inp: List[str]):
     references: list of references for each speech input.
     predictions: list of transcriptions to score.
     concatenate_texts: Whether or not to concatenate sentences before evaluation; set to True for a more accurate result.
+    normalize: Whether to normalize the CER score. If set to True, the number of mistakes is divided by the sum of the number of edit operations (insertions + substitutions + deletions) and correct characters, which results in CER values that fall within the range of 0-1 (or 0-100%).

 Returns:
     (float): the character error rate
@@ -135,7 +136,7 @@ def _info(self):
             ],
         )

-    def _compute(self, predictions, references, concatenate_texts=False):
+    def _compute(self, predictions, references, concatenate_texts=False, normalize=False):
         if concatenate_texts:
             return jiwer.compute_measures(
                 references,
@@ -154,6 +155,9 @@ def _compute(self, predictions, references, concatenate_texts=False):
                 hypothesis_transform=cer_transform,
             )
             incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
-            total += measures["substitutions"] + measures["deletions"] + measures["hits"]
+            if normalize:
+                total += measures["substitutions"] + measures["deletions"] + measures["insertions"] + measures["hits"]
+            else:
+                total += measures["substitutions"] + measures["deletions"] + measures["hits"]

         return incorrect / total
diff --git a/metrics/wer/README.md b/metrics/wer/README.md
index 21f83429f..fef4308f7 100644
--- a/metrics/wer/README.md
+++ b/metrics/wer/README.md
@@ -61,13 +61,17 @@ where

 ## How to use

-The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
+The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score). You can also set `normalize=True` to obtain a normalized WER value.

 ```python
 from evaluate import load
 wer = load("wer")
+# Standard WER calculation
 wer_score = wer.compute(predictions=predictions, references=references)
+# Normalized WER calculation
+normalized_wer_score = wer.compute(predictions=predictions, references=references, normalize=True)
+
 ```

 ## Output values
@@ -82,6 +86,10 @@ This value indicates the average number of errors per reference word.

 The **lower** the value, the **better** the performance of the ASR system, with a WER of 0 being a perfect score.

+When using the default settings, WER's output is not always a number between 0 and 1, in particular when there is a high number of insertions (see [Examples](#Examples) below).
+
+When using `normalize=True`, the WER is calculated as `(S + D + I) / (S + D + I + C)`, which ensures the output always falls within the range of 0-1 (or 0-100%).
+
 ### Values from popular papers

 This metric is highly dependent on the content and quality of the dataset, and therefore users can expect very different values for the same model but on different datasets.
@@ -127,10 +135,28 @@ print(wer_score)
 1.0
 ```

+WER above 1 due to insertion errors:
+
+```python
+from evaluate import load
+wer = load("wer")
+predictions = ["hello wonderful world and all the people in it"]
+references = ["hello world"]
+wer_score = wer.compute(predictions=predictions, references=references)
+print(wer_score)
+3.5
+# With normalization
+normalized_wer_score = wer.compute(predictions=predictions, references=references, normalize=True)
+print(normalized_wer_score)
+0.7777777777777778
+```
+
 ## Limitations and bias

 WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.

+The raw WER can exceed 1.0 when there are many insertion errors. To address this, you can use the `normalize=True` parameter to calculate a normalized WER where the number of errors is divided by the sum of the number of edit operations (`I` + `S` + `D`) and `C` (the number of correct words), which results in WER values that fall within the range of 0–1 (or 0–100%).
+
 ## Citation

 ```bibtex
diff --git a/metrics/wer/wer.py b/metrics/wer/wer.py
index 214d5b22e..526ce51f2 100644
--- a/metrics/wer/wer.py
+++ b/metrics/wer/wer.py
@@ -59,6 +59,7 @@
     references: List of references for each speech input.
     predictions: List of transcriptions to score.
     concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.
+    normalize (bool, default=False): Whether to normalize the WER score. If set to True, the number of mistakes is divided by the sum of the number of edit operations (insertions + substitutions + deletions) and correct words, which results in WER values that fall within the range of 0-1 (or 0-100%).

 Returns:
     (float): the word error rate
@@ -93,7 +94,7 @@ def _info(self):
             ],
         )

-    def _compute(self, predictions=None, references=None, concatenate_texts=False):
+    def _compute(self, predictions=None, references=None, concatenate_texts=False, normalize=False):
         if concatenate_texts:
             return compute_measures(references, predictions)["wer"]
         else:
@@ -102,5 +103,8 @@ def _compute(self, predictions=None, references=None, concatenate_texts=False):
             for prediction, reference in zip(predictions, references):
                 measures = compute_measures(reference, prediction)
                 incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
-                total += measures["substitutions"] + measures["deletions"] + measures["hits"]
+                if normalize:
+                    total += measures["substitutions"] + measures["deletions"] + measures["insertions"] + measures["hits"]
+                else:
+                    total += measures["substitutions"] + measures["deletions"] + measures["hits"]
             return incorrect / total
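
For reviewers who want to sanity-check the normalized scores outside of `evaluate`, here is a minimal standalone sketch of the logic this patch adds. It assumes a jiwer version that still provides `compute_measures` (the same function `wer.py` imports); the helper name `error_rate` is illustrative and not part of the patch.

```python
# Minimal sketch of the normalization added in this patch (illustrative, not part of the diff).
# With normalize=True the denominator also counts insertions, so the score becomes
# (S + D + I) / (S + D + I + C) and can never exceed 1.
from jiwer import compute_measures  # same import as metrics/wer/wer.py


def error_rate(predictions, references, normalize=False):
    incorrect = 0
    total = 0
    for prediction, reference in zip(predictions, references):
        measures = compute_measures(reference, prediction)
        incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
        total += measures["substitutions"] + measures["deletions"] + measures["hits"]
        if normalize:
            total += measures["insertions"]  # insertions enter the denominator
    return incorrect / total


predictions = ["hello wonderful world and all the people in it"]
references = ["hello world"]
print(error_rate(predictions, references))                  # 3.5, matches the README example
print(error_rate(predictions, references, normalize=True))  # 0.7777777777777778
```

Because `incorrect` is `S + D + I` and the normalized `total` is `S + D + I + C`, the numerator can never exceed the denominator, which is why the normalized score is bounded by 1.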