diff --git a/speech2text/accuracy_eval_server.py b/speech2text/accuracy_eval_server.py
new file mode 100644
index 0000000000..30c50fd281
--- /dev/null
+++ b/speech2text/accuracy_eval_server.py
@@ -0,0 +1,154 @@
+# Copyright 2025 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import argparse
+import array
+import json
+import os
+from typing import List, Tuple
+
+from whisper.normalizers import EnglishTextNormalizer
+
+from manifest import Manifest
+from legacy_helpers import __gather_predictions
+from helpers import compute_wer_with_concatenation, assemble_stream
+
+
+max_duration = float(os.environ.get("MAX_DURATION", "30.0"))
+labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
+          "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y",
+          "z", "'"]
+dtype_map = {
+    "int8": 'b',
+    "int16": 'h',
+    "int32": 'l',
+    "int64": 'q',
+}
+
+
+def word_error_rate(hypotheses: List[str],
+                    references: List[str]) -> Tuple[float, int, int]:
+    """
+    Computes the average word error rate between two texts represented as
+    corresponding lists of strings. Hypotheses and references must have the
+    same length.
+
+    Args:
+        hypotheses: list of hypotheses
+        references: list of references
+
+    Returns:
+        (wer, scores, words): average word error rate, total edit count, and
+        total reference word count
+    """
+    normalizer = EnglishTextNormalizer()
+
+    scores = 0
+    words = 0
+    if len(hypotheses) != len(references):
+        raise ValueError(
+            "In word error rate calculation, hypotheses and reference lists "
+            "must have the same number of elements. But got: {0} and {1} "
+            "correspondingly".format(len(hypotheses), len(references)))
+    for h, r in zip(hypotheses, references):
+        h = normalizer(h)
+        r = normalizer(r)
+        h_list = h.split()
+        r_list = r.split()
+        scores_clip, words_clip = compute_wer_with_concatenation(
+            h_list, r_list)
+        scores += scores_clip
+        words += words_clip
+    wer = scores / words
+    return wer, scores, words
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--log_dir", required=True)
+    parser.add_argument("--dataset_dir", required=True)
+    parser.add_argument("--manifest", required=True)
+    parser.add_argument(
+        "--output_dtype",
+        default="int64",
+        choices=dtype_map.keys(),
+        help="Output data type")
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    manifest = Manifest(args.dataset_dir,
+                        [args.manifest],
+                        labels,
+                        len(labels),
+                        max_duration=max_duration)
+    with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
+        results = json.load(fh)
+
+    # Chunk filenames look like "<source>_<index>.wav"; group hypotheses by
+    # source and keep the chunk index so each stream can be re-ordered.
+    h_catalog = dict()
+    r_catalog = dict()
+    for result in results:
+        file_base = os.path.basename(
+            manifest[result["qsl_idx"]]["audio_filepath"][0]).split('.')[0]
+        key, index = file_base.rsplit('_', 1)
+        index = int(index)
+        h = array.array(dtype_map[args.output_dtype],
+                        bytes.fromhex(result["data"])).tolist()
+        h = __gather_predictions([[h]], labels=labels)
+        h_tuple = (index, h[0])
+        r = manifest[result["qsl_idx"]]["transcript"]
+        r = __gather_predictions([[r]], labels=labels)
+        if key in h_catalog:
+            h_catalog[key].append(h_tuple)
+        else:
+            h_catalog[key] = [h_tuple]
+            r_catalog[key] = r[0]
+
+    # Sort each stream by chunk index and stitch the fragments back into a
+    # single hypothesis before scoring.
+    hypotheses = []
+    references = []
+    for key in h_catalog.keys():
+        h_catalog[key] = sorted(h_catalog[key])
+        h_stream = [hyp for _, hyp in h_catalog[key]]
+        hypotheses.append(assemble_stream(h_stream))
+        references.append(r_catalog[key])
+
+    wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references)
+    print("Word Error Rate: {:}%, accuracy={:}%".format(
+        wer * 100, (1 - wer) * 100))
+
+
+if __name__ == '__main__':
+    main()
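[Illustration, not part of the diff: a minimal sketch of the scoring path
above, using hypothetical strings and assuming the usual edit-distance
behavior of compute_wer_with_concatenation.]

    from accuracy_eval_server import word_error_rate

    hyps = ["the cat sat on the mat"]
    refs = ["the cat sat on a mat"]
    wer, edits, words = word_error_rate(hypotheses=hyps, references=refs)
    print(wer)  # one substitution over six reference words ~= 0.167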
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references))) + for h, r in zip(hypotheses, references): + h = normalizer(h) + r = normalizer(r) + h_list = h.split() + r_list = r.split() + scores_clip, words_clip = compute_wer_with_concatenation( + h_list, r_list) + scores += scores_clip + words += words_clip + wer = scores / words + return wer, scores, words + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", required=True) + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--manifest", required=True) + parser.add_argument( + "--output_dtype", + default="int64", + choices=dtype_map.keys(), + help="Output data type") + args = parser.parse_args() + return args + + +def main(): + args = get_args() + manifest = Manifest(args.dataset_dir, + [args.manifest], + labels, + len(labels), + max_duration=max_duration) + with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: + results = json.load(fh) + + h_catalog = dict() + r_catalog = dict() + for result in results: + file_base = os.path.basename(manifest[result["qsl_idx"]]["audio_filepath"][0]).split('.')[0] + key = str(file_base.split('_')[0]) + index = int(file_base.split('_')[1]) + h = array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist() + h = __gather_predictions([[h]], labels=labels) + h_tuple = (index, h[0]) + r = manifest[result["qsl_idx"]]["transcript"] + r = __gather_predictions([[r]], labels=labels) + if key in h_catalog: + h_catalog[key].append(h_tuple) + else: + h_catalog[key] = [] + r_catalog[key] = r[0] + + hypotheses = [] + references = [] + for key in h_catalog.keys(): + h_catalog[key] = sorted(h_catalog[key]) + h_stream = list(map(lambda x: x[1], h_catalog[key])) + hypotheses.append(assemble_stream(h_stream)) + references.append(r_catalog[key]) + + wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references) + print("Word Error Rate: {:}%, accuracy={:}%".format( wer * 100, (1 - wer) * 100)) + +if __name__ == '__main__': + main() diff --git a/speech2text/download_dataset.sh b/speech2text/download_dataset.sh index 37a3e97f99..8887986387 100644 --- a/speech2text/download_dataset.sh +++ b/speech2text/download_dataset.sh @@ -46,3 +46,15 @@ python utils/repackage_librispeech.py --manifest ${DATA_DIR}/dev-all.json \ --data_dir ${DATA_DIR} \ --output_dir ${DATA_DIR}/dev-all-repack \ --output_json /data/dev-all-repack.json + +# Repackages Librispeech into fully-constructed samples (median ~180s) +python utils/long_librispeech.py --manifest ${DATA_DIR}/dev-all.json \ + --data_dir ${DATA_DIR} \ + --output_dir ${DATA_DIR}/dev-all-extended \ + --output_json /data/dev-all-extended.json + +# Repackages Librispeech into server-sized samples from the extended +python utils/stream_librispeech.py --manifest ${DATA_DIR}/dev-all-extended.json \ + --data_dir ${DATA_DIR} \ + --output_dir ${DATA_DIR}/dev-all-server \ + --output_json /data/dev-all-server.json diff --git a/speech2text/helpers.py b/speech2text/helpers.py index 279a3d396c..1cb810c8da 100644 --- a/speech2text/helpers.py +++ b/speech2text/helpers.py @@ -17,6 +17,41 @@ from legacy_helpers import __levenshtein +def assemble_stream(stream): + # 'stream' is a list of sentence fragments (strings) + committed_list = [] + prev_list = [] + for clip in stream: +# print("clip: " + str(clip)) + wordlist = clip.split() + merged = [] + max_metric = -1 +# print("prev: " + str(" ".join(prev_list))) +# print("list: " + str(" ".join(wordlist))) + if 
diff --git a/speech2text/reference_mlperf.py b/speech2text/reference_mlperf.py
index 782779126c..11e74ca488 100644
--- a/speech2text/reference_mlperf.py
+++ b/speech2text/reference_mlperf.py
@@ -57,11 +57,17 @@ def get_args():
     return args
 
 
+# Temporarily map the Server scenario onto Offline while server mode is under
+# test; restore "Server": lg.TestScenario.Server to re-enable it.
 scenario_map = {
     "Offline": lg.TestScenario.Offline,
-    "Server": lg.TestScenario.Server,
+    "Server": lg.TestScenario.Offline,
 }
 
 
 def main():
     args = get_args()
@@ -103,15 +109,27 @@ def main():
     sut.stop()
 
     if args.accuracy:
-        cmd = [
-            "python3",
-            "accuracy_eval.py",
-            "--log_dir",
-            log_path,
-            "--dataset_dir",
-            args.dataset_dir,
-            "--manifest",
-            args.manifest]
+        # Server runs are scored by the stream-aware script, which reassembles
+        # overlapping chunk hypotheses before computing WER.
+        if args.scenario == "Offline":
+            accuracy_script = "accuracy_eval.py"
+        else:
+            accuracy_script = "accuracy_eval_server.py"
+        cmd = [
+            "python3",
+            accuracy_script,
+            "--log_dir",
+            log_path,
+            "--dataset_dir",
+            args.dataset_dir,
+            "--manifest",
+            args.manifest]
+        print(f"Running accuracy script: {cmd}")
         subprocess.check_call(cmd)
diff --git a/speech2text/reference_mlperf_accuracy_server.sh b/speech2text/reference_mlperf_accuracy_server.sh
new file mode 100644
index 0000000000..6a11455116
--- /dev/null
+++ b/speech2text/reference_mlperf_accuracy_server.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "Time Start: $(date +%s)"
+export WORKSPACE_DIR="/workspace"
+export DATA_DIR="/data"
+export MANIFEST_FILE="${DATA_DIR}/dev-all-server.json"
+export RUN_LOGS=${WORKSPACE_DIR}/run_output
+export SCENARIO="Server"
+
+# One inference instance per NUMA node, each sized to that node's core count.
+export NUM_CORES=$(($(lscpu | grep "Socket(s):" | awk '{print $2}') * $(lscpu | grep "Core(s) per socket:" | awk '{print $4}')))
+export NUM_NUMA_NODES=$(lscpu | grep "NUMA node(s)" | awk '{print $NF}')
+export CORES_PER_INST=$((${NUM_CORES} / ${NUM_NUMA_NODES}))
+export OMP_NUM_THREADS=${CORES_PER_INST}
+export INSTS_PER_NODE=1
+export NUM_INSTS=$((${NUM_NUMA_NODES} * ${INSTS_PER_NODE}))
+
+# First physical core of each NUMA node, e.g. "0,32" on a two-node machine.
+export START_CORES=$(lscpu | grep "NUMA node.* CPU.*" | awk "{print \$4}" | cut -d "-" -f 1 | paste -s -d ',')
+
+echo "CORES_PER_INST: ${CORES_PER_INST}"
+echo "NUM_INSTS: ${NUM_INSTS}"
+echo "START_CORES: ${START_CORES}"
+
+python reference_mlperf.py \
+       --dataset_dir ${DATA_DIR} \
+       --manifest ${MANIFEST_FILE} \
+       --scenario ${SCENARIO} \
+       --log_dir ${RUN_LOGS} \
+       --num_workers ${NUM_INSTS} \
+       --accuracy
+
+echo "Time Stop: $(date +%s)"
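[Illustration, not part of the diff: on a hypothetical machine with 2
sockets, 32 cores per socket, and 2 NUMA nodes, the script above derives
NUM_CORES=64, CORES_PER_INST=32 (one instance per node, OMP_NUM_THREADS=32),
NUM_INSTS=2, and START_CORES="0,32", the first core of each node.]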
diff --git a/speech2text/utils/long_librispeech.py b/speech2text/utils/long_librispeech.py
new file mode 100644
index 0000000000..1ca46d6ca2
--- /dev/null
+++ b/speech2text/utils/long_librispeech.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import argparse
+import json
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+
+PAD_DURATION = 0.1  # seconds of silence inserted between concatenated clips
+SR = 16000
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--manifest", required=True)
+    parser.add_argument("--data_dir", required=True)
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument("--output_json", required=True)
+    args = parser.parse_args()
+    return args
+
+
+def get_source_name(fname):
+    # LibriSpeech files are named "<speaker>-<chapter>-<utterance>.<ext>";
+    # the source recording is identified by "<speaker>-<chapter>".
+    basename, _ = os.path.splitext(fname)
+    return "-".join(basename.split("-")[:2])
+
+
+def prepare_clip(current_entry, new_fname):
+    # Concatenates all clips of one source recording, separated by
+    # PAD_DURATION seconds of silence, and builds the manifest entry.
+    pad_audio = np.zeros(int(PAD_DURATION * SR))
+    new_audio = []
+    new_transcript = ""
+    for clip in current_entry:
+        if len(new_audio) > 0:
+            new_audio.append(pad_audio)
+            new_transcript += " "
+        new_audio.append(clip[0])
+        new_transcript += clip[1]["transcript"]
+    new_audio = np.concatenate(new_audio)
+    new_json = get_sample_json(new_audio, new_transcript, new_fname)
+    return new_audio, new_json
+
+
+def get_sample_json(audio, transcript, fname):
+    json_file = {
+        "transcript": transcript,
+        "files": [
+            {
+                "channels": 1,
+                "sample_rate": float(SR),
+                "bitdepth": 16,
+                "bitrate": 256000.0,
+                "duration": float(len(audio) / SR),
+                "num_samples": int(len(audio)),
+                "encoding": "Signed Integer PCM",
+                "silent": False,
+                "fname": fname,
+                "speed": 1
+            }
+        ],
+        "original_duration": float(len(audio) / SR),
+        "original_num_samples": int(len(audio))
+    }
+    return json_file
+
+
+def main():
+    args = get_args()
+    with open(args.manifest, "r") as manifest:
+        json_data = json.load(manifest)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Group all clips by their source recording.
+    catalog = dict()
+    for data in json_data:
+        original_fname = data["files"][0]["fname"]
+        original_transcript = data["transcript"]
+        original_audio = librosa.load(
+            os.path.join(args.data_dir, original_fname),
+            sr=SR)[0]
+        original_json = get_sample_json(
+            original_audio, original_transcript, original_fname)
+
+        source_name = get_source_name(os.path.basename(original_fname))
+        if source_name not in catalog:
+            catalog[source_name] = []
+
+        catalog[source_name].append((original_audio, original_json))
+
+    # Export one concatenated clip per source recording.
+    full_json = []
+    for key, clips in catalog.items():
+        if len(clips) == 0:
+            continue
+        new_fname = os.path.join(args.output_dir, key + ".wav")
+        new_audio, new_json = prepare_clip(clips, new_fname)
+        sf.write(new_fname, new_audio, SR)
+        full_json.append(new_json)
+
+    # Creates a json manifest containing all newly-repacked clips.
+    with open(args.output_json, "w") as manifest:
+        json.dump(full_json, manifest, indent=2)
+
+
+if __name__ == "__main__":
+    main()
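[Illustration, not part of the diff: a minimal sketch of the concatenation
arithmetic in prepare_clip, using hypothetical silent clips.]

    import numpy as np

    SR = 16000
    clip_a = np.zeros(2 * SR)                      # 2.0 s of placeholder audio
    clip_b = np.zeros(3 * SR)                      # 3.0 s
    pad = np.zeros(int(0.1 * SR))                  # PAD_DURATION of silence
    joined = np.concatenate([clip_a, pad, clip_b])
    print(len(joined), len(joined) / SR)           # 81600 samples, 5.1 s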
diff --git a/speech2text/utils/stream_librispeech.py b/speech2text/utils/stream_librispeech.py
new file mode 100644
index 0000000000..f93e592aa7
--- /dev/null
+++ b/speech2text/utils/stream_librispeech.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import argparse
+import json
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+
+DURATION = 6.0      # chunk length in seconds
+OVERLAP = 4.0       # overlap between consecutive chunks, i.e. a 2 s stride
+PAD_DURATION = 0.0  # optional trailing silence appended to each chunk
+SR = 16000
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--manifest", required=True)
+    parser.add_argument("--data_dir", required=True)
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument("--output_json", required=True)
+    args = parser.parse_args()
+    return args
+
+
+def get_source_name(fname):
+    basename, _ = os.path.splitext(fname)
+    return "-".join(basename.split("-")[:2])
+
+
+def get_sample_json(audio, transcript, fname):
+    json_file = {
+        "transcript": transcript,
+        "files": [
+            {
+                "channels": 1,
+                "sample_rate": float(SR),
+                "bitdepth": 16,
+                "bitrate": 256000.0,
+                "duration": float(len(audio) / SR),
+                "num_samples": int(len(audio)),
+                "encoding": "Signed Integer PCM",
+                "silent": False,
+                "fname": fname,
+                "speed": 1
+            }
+        ],
+        "original_duration": float(len(audio) / SR),
+        "original_num_samples": int(len(audio))
+    }
+    return json_file
+
+
+def main():
+    args = get_args()
+    with open(args.manifest, "r") as manifest:
+        json_data = json.load(manifest)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    pad_audio = np.zeros(int(PAD_DURATION * SR))
+
+    # Group all clips by their source recording.
+    catalog = dict()
+    for data in json_data:
+        original_fname = data["files"][0]["fname"]
+        original_transcript = data["transcript"]
+        original_audio = librosa.load(
+            os.path.join(args.data_dir, original_fname),
+            sr=SR)[0]
+        original_json = get_sample_json(
+            original_audio, original_transcript, original_fname)
+
+        source_name = get_source_name(os.path.basename(original_fname))
+        if source_name not in catalog:
+            catalog[source_name] = []
+
+        catalog[source_name].append((original_audio, original_json))
+
+    # Slice each clip into overlapping windows named "<source>_<index>.wav".
+    full_json = []
+    for key in catalog.keys():
+        index = 0
+        for entry in catalog[key]:
+            start = 0.0  # reset per clip so chunk offsets stay in range
+            clip_duration = entry[1]["original_duration"]
+            full_transcript = entry[1]["transcript"]
+            while start < (clip_duration - OVERLAP):
+                end = min(start + DURATION, clip_duration)
+                chunk = entry[0][int(start * SR):int(end * SR)]
+                new_audio = np.concatenate([chunk, pad_audio])
+                new_fname = os.path.join(
+                    args.output_dir, key + "_" + str(index) + ".wav")
+                # Each chunk carries the full source transcript; the accuracy
+                # script scores against it after reassembling the stream.
+                new_json = get_sample_json(new_audio, full_transcript, new_fname)
+                full_json.append(new_json)
+                sf.write(new_fname, new_audio, SR)
+                start = end - OVERLAP
+                index += 1
+
+    # Creates a json manifest containing all newly-created chunks.
+    with open(args.output_json, "w") as manifest:
+        json.dump(full_json, manifest, indent=2)
+
+
+if __name__ == "__main__":
+    main()
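[Illustration, not part of the diff: the chunking loop above advances by
DURATION - OVERLAP = 2 s, so a hypothetical 10 s clip yields the windows
[0, 6), [2, 8), and [4, 10).]

    DURATION, OVERLAP, clip_duration = 6.0, 4.0, 10.0
    start = 0.0
    while start < (clip_duration - OVERLAP):
        end = min(start + DURATION, clip_duration)
        print(start, end)    # (0.0, 6.0), (2.0, 8.0), (4.0, 10.0)
        start = end - OVERLAP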