Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions speech2text/accuracy_eval_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Copyright 2025 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import argparse
import array
import json
import os
import sys
from typing import List, Tuple

from whisper.normalizers import EnglishTextNormalizer

from helpers import assemble_stream, compute_wer_with_concatenation, get_expanded_wordlist
from legacy_helpers import __gather_predictions, __levenshtein
from manifest import Manifest


max_duration = float(os.environ.get("MAX_DURATION", "30.0"))
labels = [
" ",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"'"]
dtype_map = {
"int8": 'b',
"int16": 'h',
"int32": 'l',
"int64": 'q',
}


def word_error_rate(hypotheses: List[str], references: List[str]) -> float:
"""
Computes Average Word Error rate between two texts represented as
corresponding lists of string. Hypotheses and references must have same length.

Args:
hypotheses: list of hypotheses
references: list of references

Returns:
(float) average word error rate
"""
normalizer = EnglishTextNormalizer()

scores = 0
words = 0
if len(hypotheses) != len(references):
raise ValueError("In word error rate calculation, hypotheses and reference"
" lists must have the same number of elements. But I got:"
"{0} and {1} correspondingly".format(len(hypotheses), len(references)))
for h, r in zip(hypotheses, references):
h = normalizer(h)
r = normalizer(r)
h_list = h.split()
r_list = r.split()
scores_clip, words_clip = compute_wer_with_concatenation(
h_list, r_list)
scores += scores_clip
words += words_clip
wer = scores / words
return wer, scores, words


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--log_dir", required=True)
parser.add_argument("--dataset_dir", required=True)
parser.add_argument("--manifest", required=True)
parser.add_argument(
"--output_dtype",
default="int64",
choices=dtype_map.keys(),
help="Output data type")
args = parser.parse_args()
return args


def main():
args = get_args()
manifest = Manifest(args.dataset_dir,
[args.manifest],
labels,
len(labels),
max_duration=max_duration)
with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
results = json.load(fh)

h_catalog = dict()
r_catalog = dict()
for result in results:
file_base = os.path.basename(manifest[result["qsl_idx"]]["audio_filepath"][0]).split('.')[0]
key = str(file_base.split('_')[0])
index = int(file_base.split('_')[1])
h = array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()
h = __gather_predictions([[h]], labels=labels)
h_tuple = (index, h[0])
r = manifest[result["qsl_idx"]]["transcript"]
r = __gather_predictions([[r]], labels=labels)
if key in h_catalog:
h_catalog[key].append(h_tuple)
else:
h_catalog[key] = []
r_catalog[key] = r[0]

hypotheses = []
references = []
for key in h_catalog.keys():
h_catalog[key] = sorted(h_catalog[key])
h_stream = list(map(lambda x: x[1], h_catalog[key]))
hypotheses.append(assemble_stream(h_stream))
references.append(r_catalog[key])

wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references)
print("Word Error Rate: {:}%, accuracy={:}%".format( wer * 100, (1 - wer) * 100))

if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions speech2text/download_dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,15 @@ python utils/repackage_librispeech.py --manifest ${DATA_DIR}/dev-all.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-repack \
--output_json /data/dev-all-repack.json

# Repackages Librispeech into fully-constructed samples (median ~180s)
python utils/long_librispeech.py --manifest ${DATA_DIR}/dev-all.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-extended \
--output_json /data/dev-all-extended.json

# Repackages Librispeech into server-sized samples from the extended
python utils/stream_librispeech.py --manifest ${DATA_DIR}/dev-all-extended.json \
--data_dir ${DATA_DIR} \
--output_dir ${DATA_DIR}/dev-all-server \
--output_json /data/dev-all-server.json
35 changes: 35 additions & 0 deletions speech2text/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,41 @@
from legacy_helpers import __levenshtein


def assemble_stream(stream):
# 'stream' is a list of sentence fragments (strings)
committed_list = []
prev_list = []
for clip in stream:
# print("clip: " + str(clip))
wordlist = clip.split()
merged = []
max_metric = -1
# print("prev: " + str(" ".join(prev_list)))
# print("list: " + str(" ".join(wordlist)))
if len(committed_list) == 0:
committed_list = wordlist
prev_list = wordlist
continue

for i in range(len(prev_list)):
for j in range(len(wordlist)):
merged = prev_list[:(len(prev_list)-i)] + wordlist[j:]
metric = len(merged) - __levenshtein(merged, prev_list) - __levenshtein(merged, wordlist)
if (metric > max_metric) or (metric == max_metric and j > i):
max_metric = metric
new_commit = committed_list[:(len(committed_list)-i)] + wordlist[j:]
# print("new_commit: " + str(new_commit))
try:
committed_list = new_commit
except NameError:
committed_list = committed_list
prev_list = wordlist
# print("committed_list: " + str(" ".join(committed_list)))
# print(" ")

return " ".join(committed_list)


def compute_wer_with_concatenation(prediction, reference):
"""
Compute WER considering concatenated words as correct matches using kaldialign
Expand Down
38 changes: 28 additions & 10 deletions speech2text/reference_mlperf.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,17 @@ def get_args():
return args


# Temporarily disabling server mode for testing
scenario_map = {
"Offline": lg.TestScenario.Offline,
"Server": lg.TestScenario.Server,
"Server": lg.TestScenario.Offline,
}

#scenario_map = {
# "Offline": lg.TestScenario.Offline,
# "Server": lg.TestScenario.Server,
#}


def main():
args = get_args()
Expand Down Expand Up @@ -103,15 +109,27 @@ def main():
sut.stop()

if args.accuracy:
cmd = [
"python3",
"accuracy_eval.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]
if args.scenario == "Offline":
cmd = [
"python3",
"accuracy_eval.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]
else:
cmd = [
"python3",
"accuracy_eval_server.py",
"--log_dir",
log_path,
"--dataset_dir",
args.dataset_dir,
"--manifest",
args.manifest]

print(f"Running accuracy script: {cmd}")
subprocess.check_call(cmd)

Expand Down
52 changes: 52 additions & 0 deletions speech2text/reference_mlperf_accuracy_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#
# THIS IS A GENERATED DOCKERFILE.
#
# This file was assembled from multiple pieces, whose use is documented
# throughout. Please refer to the TensorFlow dockerfiles documentation
# for more information.

#/bin/bash

echo "Time Start: $(date +%s)"
export WORKSPACE_DIR="/workspace"
export DATA_DIR="/data"
export MANIFEST_FILE="${DATA_DIR}/dev-all-server.json"
export RUN_LOGS=${WORKSPACE_DIR}/run_output
export SCENARIO="Server"

export NUM_CORES=$(($(lscpu | grep "Socket(s):" | awk '{print $2}') * $(lscpu | grep "Core(s) per socket:" | awk '{print $4}')))
export NUM_NUMA_NODES=$(lscpu | grep "NUMA node(s)" | awk '{print $NF}')
export CORES_PER_INST=$((${NUM_CORES} / ${NUM_NUMA_NODES}))
export OMP_NUM_THREADS=${CORES_PER_INST}
export INSTS_PER_NODE=1
export NUM_INSTS=$((${NUM_NUMA_NODES} * ${INSTS_PER_NODE}))

export START_CORES=$(lscpu | grep "NUMA node.* CPU.*" | awk "{print \$4}" | cut -d "-" -f 1 | paste -s -d ',')

echo "CORES_PER_INST: ${CORES_PER_INST}"
echo "NUM_INSTS: ${NUM_INSTS}"
echo "START_CORES: ${START_CORES}"

python reference_mlperf.py \
--dataset_dir ${DATA_DIR} \
--manifest ${MANIFEST_FILE} \
--scenario ${SCENARIO} \
--log_dir ${RUN_LOGS} \
--num_workers ${NUM_INSTS} \
"--accuracy"

echo "Time Stop: $(date +%s)"
Loading