Merge branch 'master' into fix/preprocessing-documentation

anivar · web-flow · commit c6aa29aada9e · 2025-08-03T01:07:02.000+05:30
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025
 
 Please see [here](https://docs.mlcommons.org/inference/benchmarks/) for the MLPerf inference documentation website which includes automated commands to run MLPerf inference benchmarks using different implementations.
 
-## MLPerf Inference v5.1 (submission deadline July 25, 2025)
+## MLPerf Inference v5.1 (submission deadline August 1, 2025)
 
 For submissions, please use the master branch and any commit since the [5.1 seed release (soon to be released)]() although it is best to use the latest commit in the [master branch](https://github.com/mlcommons/inference).
 
diff --git a/language/deepseek-r1/README.md b/language/deepseek-r1/README.md
@@ -116,7 +116,7 @@ The setup script creates a virtual environment and configures it differently bas
 
 ### PyTorch Backend (Distributed)
 
-> ⚠️ **IMPORTANT NOTE**: The PyTorch reference implementation takes approximately 8 days to run on an H200x8 system. This is because large max-OSL (32K) limits concurrency (max-BS=16), and unoptimized pytorch forward and decode logics.
+> ⚠️ **IMPORTANT NOTE**: The PyTorch reference implementation can take up to approximately 8 days to run on an H200x8 system. This is because the large max-OSL (20K) limits concurrency (max-BS=16), and because the PyTorch forward and decode logic is unoptimized.
 
 PyTorch backend uses distributed execution with `torchrun` and `run_eval_mpi.py`:
 
@@ -222,8 +222,8 @@ Pytorch reference scores:
 
 ```bash
 Evaluation Results: {
-  "mean-accuracy": 81.67730173199635,
-  "mean-output-tok-len": 4043.449863263446,
+  "mean-accuracy": 81.3582,
+  "mean-output-tok-len": 3886.2274,
   "num-samples": 4388
 }
 ```
diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py
@@ -770,14 +770,12 @@ def print_evaluation_results(df_evaluated: pd.DataFrame,
         # 'evaluated': int(evaluated),
         # 'correct': int(correct),
         'exact_match': float(accuracy),
-        'TOKENS_PER_SAMPLE': mean_output_len,
+        'tokens_per_sample': mean_output_len,
         'num-samples': len(df_evaluated),
     }
-
-    result_str = json.dumps(results, indent=2)
-    print(f"\nEvaluation Results: {result_str}")
-
-    return results
+    
+    print("\nResults\n")
+    print(results)
 
 
 def process_and_save_dataframe(df: pd.DataFrame,
diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py
@@ -5,7 +5,7 @@
 
 # Configuration constants
 MAX_ISL = 3136  # max input sequence length
-MAX_OSL = 32 * 1024  # max output sequence length
+MAX_OSL = 20 * 1000  # max output sequence length
 MAX_TEMPLATE_TOKS = 4  # max template tokens
 MODEL_REVISION = "56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad"
 
diff --git a/language/llama3.1-8b/README.md b/language/llama3.1-8b/README.md
@@ -171,7 +171,7 @@ mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname=<path
 
 **Native method**
 ```
-rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/datasets/sample_cnn_eval_5000.json ./ -P
+rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/datasets/cnn_eval_5000.json ./ -P
 ```
 
 #### Calibration
@@ -200,7 +200,7 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.jso
 python -u main.py --scenario Offline \
                 --model-path ${CHECKPOINT_PATH} \
                 --batch-size 16 \
-                --dtype float16 \
+                --dtype bfloat16 \
                 --user-conf user.conf \
                 --total-sample-count 13368 \
                 --dataset-path ${DATASET_PATH} \
@@ -215,7 +215,7 @@ python -u main.py --scenario Offline \
 python -u main.py --scenario Server \
                 --model-path ${CHECKPOINT_PATH} \
                 --batch-size 16 \
-                --dtype float16 \
+                --dtype bfloat16 \
                 --user-conf user.conf \
                 --total-sample-count 13368 \
                 --dataset-path ${DATASET_PATH} \
@@ -238,7 +238,7 @@ python -u main.py --scenario Offline \
                 --model-path ${CHECKPOINT_PATH} \
                 --batch-size 16 \
                 --accuracy \
-                --dtype float16 \
+                --dtype bfloat16 \
                 --user-conf user.conf \
                 --total-sample-count 13368 \
                 --dataset-path ${DATASET_PATH} \
@@ -265,7 +265,7 @@ python -u main.py --scenario Server \
                 --model-path ${CHECKPOINT_PATH} \
                 --batch-size 16 \
                 --accuracy \
-                --dtype float16 \
+                --dtype bfloat16 \
                 --user-conf user.conf \
                 --total-sample-count 13368 \
                 --dataset-path ${DATASET_PATH} \
@@ -282,6 +282,34 @@ fi
 
 The ServerSUT was not tested for GPU runs.
 
+### Edge
+```
+OUTPUT_LOG_DIR=offline-accuracy-logs
+
+mkdir -p "run_outputs"  # The script will dump all the outputs to 'run_outputs'.
+
+python -u main.py --lg-model-name llama3_1-8b-edge \
+                --scenario Offline \
+                --model-path ${CHECKPOINT_PATH} \
+                --batch-size 16 \
+                --accuracy \
+                --dtype bfloat16 \
+                --user-conf user.conf \
+                --total-sample-count 13368 \
+                --dataset-path ${DATASET_PATH} \
+                --output-log-dir output \
+                --tensor-parallel-size ${GPU_COUNT} \
+                --vllm
+
+
+ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json
+if [ -e ${ACCURACY_LOG_FILE} ]; then
+        python evaluation.py --mlperf-accuracy-file ${ACCURACY_LOG_FILE} \
+                --dataset-file ${DATASET_PATH} --dtype int32
+fi
+```
+
+
 ### Evaluate the accuracy using MLCFlow
 You can also evaluate the accuracy from the generated accuracy log by using the following MLC command
 
@@ -298,7 +326,8 @@ mlcr run,accuracy,mlperf,_cnndm_llama_3,_datacenter --result_dir=<Path to direct
 ```
 
 ## Accuracy Target
-Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets:
+### Datacenter
+Running the GPU implementation in BF16 precision resulted in the following BF16 accuracy targets:
 ```
 {
         'rouge1': 38.7792,
@@ -310,3 +339,16 @@ Running the GPU implementation in FP16 precision resulted in the following FP16
 }
 ```
 The accuracy target is 99% for rouge1, rouge2, rougeL and rougeLsum, and 90% for gen_len
+
+### Edge
+Running the GPU implementation in BF16 precision resulted in the following BF16 accuracy targets:
+```
+{
+        'rouge1': 39.06,
+        'rouge2': 16.1147,
+        'rougeL': 24.6375,
+        'rougeLsum': 36.124,
+        'gen_len': 3051113,
+        'gen_num': 5000,
+}
+```
diff --git a/language/llama3.1-8b/download_cnndm.py b/language/llama3.1-8b/download_cnndm.py
@@ -69,13 +69,23 @@ def get_args():
     os.makedirs(save_dataset_path)
 
 # Load dataset from the hub
-dataset = load_dataset(dataset_id, name=dataset_config)
+dataset = load_dataset(dataset_id, name=dataset_config, split="validation")
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 tokenizer.padding_side = "left"
 tokenizer.pad_token = tokenizer.eos_token
 tokenizer.model_max_length = 8000
 
+print(f"Dshape: {dataset.shape}; type(dataset)")
+ind = set(range(dataset.shape[0]))
+if n_samples:
+    import random
+    random.seed(42)
+    dataset = dataset.shuffle(seed=42)
+    dataset = dataset.flatten_indices()
+    dataset = dataset.take(n_samples)
+    ind = set(random.sample(range(0, 13368), n_samples))
+
 
 instruction_template = {
     "llama": (
@@ -90,41 +100,34 @@ def preprocess_function(sample, padding="max_length"):
     # create list of samples
     inputs = []
 
-    if n_samples:
-        import random
-        random.seed(42)
-        ind = random.sample(range(0, 13368), n_samples)
-    else:
-        ind = list(range(0, len(sample[text_column])))
-
-    for i in range(0, len(sample[text_column])):
-        if i in ind:
-            x = dict()
-            x["instruction"] = instruction_template
-            x["input"] = sample[text_column][i]
-            x["tok_input"] = tokenizer.encode(
-                instruction_template[instruction].format_map(x))
-            x["output"] = sample[summary_column][i]
-            inputs.append(x)
+    #print(f"Num samples: {len(sample[text_column])}")
+    #for i in range(0, len(sample[text_column])):
+    x = dict()
+    x["instruction"] = instruction_template
+    x["input"] = sample[text_column]
+    x["tok_input"] = tokenizer.encode(
+        instruction_template[instruction].format_map(x)
+    )
+    x["output"] = sample[summary_column]
+    #inputs.append(x)
     model_inputs = dict()
-    model_inputs["text"] = inputs
+    model_inputs["text"] = x
 
     return model_inputs
 
 
 # process dataset
-tokenized_dataset = dataset.map(
-    preprocess_function, batched=True, remove_columns=list(dataset["train"].features)
-)
+tokenized_dataset = dataset.map(preprocess_function, batched=False)
 
 # save dataset to disk
 if n_samples is None:
     file = "cnn_eval.json"
 else:
-    file = f"sample_cnn_eval_{n_samples}.json"
+    file = f"cnn_eval_{n_samples}.json"
 
+print(f"Num of tokenized dataset: {len(tokenized_dataset['text'])}")
 with open(os.path.join(save_dataset_path, file), "w") as write_f:
     json.dump(
-        tokenized_dataset["validation"]["text"], write_f, indent=4, ensure_ascii=False
+        tokenized_dataset["text"], write_f, indent=4, ensure_ascii=False
     )
 print("Dataset saved in ", save_dataset_path)
diff --git a/language/llama3.1-8b/evaluation.py b/language/llama3.1-8b/evaluation.py
@@ -70,7 +70,7 @@ def main():
 
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        model_max_length=2048,
+        model_max_length=128000,
         padding_side="left",
         use_fast=False,
     )
diff --git a/language/llama3.1-8b/main.py b/language/llama3.1-8b/main.py
@@ -165,7 +165,7 @@ def main():
     else:
         raise NotImplementedError
 
-    sut_map = {"offline": SUT, "server": SUTServer, "singlestream": SUT}
+    sut_map = {"offline": SUT, "server": SUTServer, "singlestream": SUTServer}
 
     sut_cls = sut_map[args.scenario.lower()]
 
diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py
@@ -215,14 +215,15 @@ def process_queries(self):
         for output in outputs:
             request_id = int(output.request_id)
             vllm_text = output.outputs[0].text
-            results.append(vllm_text)
+            results.append((vllm_text, len(output.outputs[0].token_ids)))
             query_ids.append(self.query_idx_mapping[request_id])
             qid.append(self.qid_mapping[request_id])
 
         self.num_samples += len(results)
 
-        for i, result in enumerate(results):
+        for i, result_tuple in enumerate(results):
             # Whisper outputs space in the front and capitalizes things
+            result, n_tokens = result_tuple
             result = result.lower().strip()
             transcript = []
             for s in result:
@@ -233,7 +234,7 @@ def process_queries(self):
             assert len(transcript) == 1
             response_array = array.array('q', transcript[0])
 
-            self.output_queue.put((qid[i], response_array))
+            self.output_queue.put((qid[i], n_tokens, response_array))
             print(f"Finished {qid[i]}")
         return True
 
@@ -330,14 +331,13 @@ def flush_queries(self):
     def response_loadgen(self):
         keep_alive = True
         while keep_alive:
-            result = self.output_queue.get()
-            if result is None:
+            qid, n_tokens, response_array = self.output_queue.get()
+            if qid is None:
                 keep_alive = False
             else:
-                qid, response_array = result
                 bi = response_array.buffer_info()
                 response = lg.QuerySampleResponse(qid, bi[0],
-                                                  bi[1] * response_array.itemsize)
+                                                  bi[1] * response_array.itemsize, n_tokens)
                 lg.QuerySamplesComplete([response])
 
     def stop(self):
diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py
@@ -223,9 +223,9 @@ def main():
                 "llama2-70b-99.9": ["Server", "Offline", "Interactive"],
                 "mixtral-8x7b": ["Server", "Offline"],
                 "rgat": ["Offline"],
-                "llama3.1-405b": ["Offline", "Server"],
+                "llama3.1-405b": ["Server", "Offline", "Interactive"],
                 "pointpainting": [],
-                "llama3.1-8b": ["Server", "Offline"],
+                "llama3.1-8b": ["Server", "Offline", "Interactive"],
                 "deepseek-r1": ["Server", "Offline"],
                 "whisper": ["Offline"],
             },
diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
@@ -224,7 +224,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
                         power_is_valid,
                         power_metric,
                         power_efficiency,
-                    ) = check_power_dir(
+                    ) = checker.check_power_dir(
                         power_path,
                         ranging_path,
                         perf_path,
@@ -234,6 +234,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
                         config,
                     )
                 except Exception as e:
+                    log.error(e)
                     power_is_valid = False
                 if not power_is_valid:
                     log.warning(
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ def main():`
`70`	`70`
`71`	`71`	`tokenizer = AutoTokenizer.from_pretrained(`
`72`	`72`	`model_name,`
`73`		`- model_max_length=2048,`
	`73`	`+ model_max_length=128000,`
`74`	`74`	`padding_side="left",`
`75`	`75`	`use_fast=False,`
`76`	`76`	`)`