Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions speech2text/accuracy_eval_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Copyright 2025 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import argparse
import array
import json
import os
import sys
from typing import List, Tuple

from whisper.normalizers import EnglishTextNormalizer

from helpers import assemble_stream, compute_wer_with_concatenation, get_expanded_wordlist
from legacy_helpers import __gather_predictions, __levenshtein
from manifest import Manifest


max_duration = float(os.environ.get("MAX_DURATION", "30.0"))
labels = [
" ",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"'"]
dtype_map = {
"int8": 'b',
"int16": 'h',
"int32": 'l',
"int64": 'q',
}


def word_error_rate(hypotheses: List[str], references: List[str]) -> float:
"""
Computes Average Word Error rate between two texts represented as
corresponding lists of string. Hypotheses and references must have same length.

Args:
hypotheses: list of hypotheses
references: list of references

Returns:
(float) average word error rate
"""
normalizer = EnglishTextNormalizer()

scores = 0
words = 0
if len(hypotheses) != len(references):
raise ValueError("In word error rate calculation, hypotheses and reference"
" lists must have the same number of elements. But I got:"
"{0} and {1} correspondingly".format(len(hypotheses), len(references)))
for h, r in zip(hypotheses, references):
h = normalizer(h)
r = normalizer(r)
h_list = h.split()
r_list = r.split()
scores_clip, words_clip = compute_wer_with_concatenation(
h_list, r_list)
scores += scores_clip
words += words_clip
wer = scores / words
return wer, scores, words


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--log_dir", required=True)
parser.add_argument("--dataset_dir", required=True)
parser.add_argument("--manifest", required=True)
parser.add_argument(
"--output_dtype",
default="int64",
choices=dtype_map.keys(),
help="Output data type")
args = parser.parse_args()
return args


def main():
args = get_args()
manifest = Manifest(args.dataset_dir,
[args.manifest],
labels,
len(labels),
max_duration=max_duration)
with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
results = json.load(fh)

h_catalog = dict()
r_catalog = dict()
for result in results:
file_base = os.path.basename(manifest[result["qsl_idx"]]["audio_filepath"][0]).split('.')[0]
key = str(file_base.split('_')[0])
index = int(file_base.split('_')[1])
h = array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()
h = __gather_predictions([[h]], labels=labels)
h_tuple = (index, h[0])
r = manifest[result["qsl_idx"]]["transcript"]
r = __gather_predictions([[r]], labels=labels)
if key in h_catalog:
h_catalog[key].append(h_tuple)
else:
h_catalog[key] = []
r_catalog[key] = r[0]

hypotheses = []
references = []
for key in h_catalog.keys():
h_catalog[key] = sorted(h_catalog[key])
h_stream = list(map(lambda x: x[1], h_catalog[key]))
hypotheses.append(assemble_stream(h_stream))
references.append(r_catalog[key])

wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references)
print("Word Error Rate: {:}%, accuracy={:}%".format( wer * 100, (1 - wer) * 100))

if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions speech2text/download_dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,15 @@ python utils/repackage_librispeech.py --manifest ${DATA_DIR}/dev-all.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-repack \
--output_json /data/dev-all-repack.json

# Repackages Librispeech into fully-constructed samples (median ~180s)
python utils/long_librispeech.py --manifest ${DATA_DIR}/dev-all.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-extended \
--output_json /data/dev-all-extended.json

# Repackages Librispeech into server-sized samples from the extended
python utils/stream_librispeech.py --manifest ${DATA_DIR}/dev-all-extended.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-server \
--output_json /data/dev-all-server.json
35 changes: 35 additions & 0 deletions speech2text/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,41 @@
from legacy_helpers import __levenshtein


def assemble_stream(stream):
# 'stream' is a list of sentence fragments (strings)
committed_list = []
prev_list = []
for clip in stream:
# print("clip: " + str(clip))
wordlist = clip.split()
merged = []
max_metric = -1
# print("prev: " + str(" ".join(prev_list)))
# print("list: " + str(" ".join(wordlist)))
if len(committed_list) == 0:
committed_list = wordlist
prev_list = wordlist
continue

for i in range(len(prev_list)):
for j in range(len(wordlist)):
merged = prev_list[:(len(prev_list)-i)] + wordlist[j:]
metric = len(merged) - __levenshtein(merged, prev_list) - __levenshtein(merged, wordlist)
if (metric > max_metric) or (metric == max_metric and j > i):
max_metric = metric
new_commit = committed_list[:(len(committed_list)-i)] + wordlist[j:]
# print("new_commit: " + str(new_commit))
try:
committed_list = new_commit
except NameError:
committed_list = committed_list
prev_list = wordlist
# print("committed_list: " + str(" ".join(committed_list)))
# print(" ")

return " ".join(committed_list)


def compute_wer_with_concatenation(prediction, reference):
"""
Compute WER considering concatenated words as correct matches using kaldialign
Expand Down
38 changes: 28 additions & 10 deletions speech2text/reference_mlperf.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,17 @@ def get_args():
return args


# Temporarily disabling server mode for testing
scenario_map = {
"Offline": lg.TestScenario.Offline,
"Server": lg.TestScenario.Server,
"Server": lg.TestScenario.Offline,
}

#scenario_map = {
# "Offline": lg.TestScenario.Offline,
# "Server": lg.TestScenario.Server,
#}


def main():
args = get_args()
Expand Down Expand Up @@ -103,15 +109,27 @@ def main():
sut.stop()

if args.accuracy:
cmd = [
"python3",
"accuracy_eval.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]
if args.scenario == "Offline":
cmd = [
"python3",
"accuracy_eval.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]
else:
cmd = [
"python3",
"accuracy_eval_server.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]

print(f"Running accuracy script: {cmd}")
subprocess.check_call(cmd)

Expand Down
52 changes: 52 additions & 0 deletions speech2text/reference_mlperf_accuracy_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#
# THIS IS A GENERATED DOCKERFILE.
#
# This file was assembled from multiple pieces, whose use is documented
# throughout. Please refer to the TensorFlow dockerfiles documentation
# for more information.

#/bin/bash

echo "Time Start: $(date +%s)"
export WORKSPACE_DIR="/workspace"
export DATA_DIR="/data"
export MANIFEST_FILE="${DATA_DIR}/dev-all-server.json"
export RUN_LOGS=${WORKSPACE_DIR}/run_output
export SCENARIO="Server"

export NUM_CORES=$(($(lscpu | grep "Socket(s):" | awk '{print $2}') * $(lscpu | grep "Core(s) per socket:" | awk '{print $4}')))
export NUM_NUMA_NODES=$(lscpu | grep "NUMA node(s)" | awk '{print $NF}')
export CORES_PER_INST=$((${NUM_CORES} / ${NUM_NUMA_NODES}))
export OMP_NUM_THREADS=${CORES_PER_INST}
export INSTS_PER_NODE=1
export NUM_INSTS=$((${NUM_NUMA_NODES} * ${INSTS_PER_NODE}))

export START_CORES=$(lscpu | grep "NUMA node.* CPU.*" | awk "{print \$4}" | cut -d "-" -f 1 | paste -s -d ',')

echo "CORES_PER_INST: ${CORES_PER_INST}"
echo "NUM_INSTS: ${NUM_INSTS}"
echo "START_CORES: ${START_CORES}"

python reference_mlperf.py \
--dataset_dir ${DATA_DIR} \
--manifest ${MANIFEST_FILE} \
--scenario ${SCENARIO} \
--log_dir ${RUN_LOGS} \
--num_workers ${NUM_INSTS} \
"--accuracy"

echo "Time Stop: $(date +%s)"
Loading