diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index b58896458a..4be04f37ef 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -404,7 +404,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS"; return out; } @@ -662,7 +662,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index 4324e68038..636810735b 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -416,7 +416,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -476,7 +476,7 @@ public: } std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; return result; } diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index aa5a8f02c8..503c6ccdd6 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -477,7 +477,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -537,7 +537,7 @@ public: } std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; return result; } diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index 895b52b14d..b02e200651 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -369,7 +369,7 @@ int run(Options &options) { return -1; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -412,7 +412,7 @@ int run(Options &options) { if (passed) { std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; } std::cout << (passed ? 
"Passed" : "Failed") << std::endl; diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index 91f6a4bbe0..b8804049ac 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -499,7 +499,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS"; return out; } diff --git a/examples/21_quaternion_gemm/quaternion_gemm.cu b/examples/21_quaternion_gemm/quaternion_gemm.cu index dd2476a4fe..e8a8325a39 100644 --- a/examples/21_quaternion_gemm/quaternion_gemm.cu +++ b/examples/21_quaternion_gemm/quaternion_gemm.cu @@ -378,7 +378,7 @@ int run(Options options) { return -1; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -424,7 +424,7 @@ int run(Options options) { if (passed) { std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; } std::cout << (passed ? "Passed" : "Failed") << std::endl; diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu index 170f978b88..7f68538569 100644 --- a/examples/22_quaternion_conv/quaternion_conv.cu +++ b/examples/22_quaternion_conv/quaternion_conv.cu @@ -321,7 +321,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS"; return out; } @@ -577,7 +577,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu index d0b0aa0688..42d3876e07 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu +++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu @@ -678,7 +678,7 @@ Result profile(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); // Cleanup diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu index 9dbe03b1aa..3b0112c337 100644 --- a/examples/24_gemm_grouped/gemm_grouped.cu +++ b/examples/24_gemm_grouped/gemm_grouped.cu @@ -268,7 +268,7 @@ struct Options { output_file.open(output_path.c_str(), open_mode); if (output_file.good() && open_mode != std::ios_base::app) { - output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n"; + output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n"; } } @@ -1118,7 +1118,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1139,7 +1139,7 @@ public: std::cout << " " << this->options.problem_bins.size() << " batched GEMMs launched" << std::endl; std::cout << std::endl; std::cout << " " << "Batched Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Batched GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "Batched GFLOPS: " << result.gflops << std::endl; std::string provider = "CUTLASS"; @@ -1380,7 +1380,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1413,7 +1413,7 @@ public: std::cout << std::endl; std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "Grouped GFLOPS: " << result.gflops << std::endl; if (this->options.profile_initialization) { std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; } diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu index 285a0afab7..b8597b64dc 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu @@ -347,7 +347,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,D,H,W,C,K,T,R,S,Stride_D,Stride_H,Stride_W,Runtime,GFLOPs"; + out << "Layer,N,D,H,W,C,K,T,R,S,Stride_D,Stride_H,Stride_W,Runtime,GFLOPS"; return out; } @@ -659,7 +659,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. 
+ // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu index db33764624..5c6ad4f25e 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu @@ -340,7 +340,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS"; return out; } @@ -651,7 +651,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu index e983e8550c..391c195df9 100644 --- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu +++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu @@ -337,7 +337,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS"; return out; } @@ -649,7 +649,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu index e30718fefc..3249dd371a 100644 --- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu +++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu @@ -532,7 +532,7 @@ bool run(Options &options) { return false; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.m = problem_size.m(); result.n = problem_size.n(); result.k = problem_size.k(); @@ -661,7 +661,7 @@ bool run(Options &options) { std::cout.precision(4); std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; std::cout.precision(2); - std::cout << "GFLOPs: " << result.gflops << std::endl; + std::cout << "GFLOPS: " << result.gflops << std::endl; std::cout << "Normalized L2 norm of" << std::endl; std::cout.precision(8); std::cout << std::scientific diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu index 7eec75e5c0..6f9d3f3f12 100644 --- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu +++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu @@ -333,7 +333,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs,3xTF32_vs_FP64,1xTF32_vs_FP64,FP32_vs_FP64"; + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS,3xTF32_vs_FP64,1xTF32_vs_FP64,FP32_vs_FP64"; return out; } @@ -559,7 +559,7 @@ Result profile_convolution(Options const 
&options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu index 2b7d7bef2e..6e979a969a 100644 --- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu +++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu @@ -472,7 +472,7 @@ bool run(Options &options) { return false; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.m = problem_size.m(); result.n = problem_size.n(); result.k = problem_size.k(); @@ -603,7 +603,7 @@ bool run(Options &options) { std::cout.precision(4); std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl; std::cout.precision(2); - std::cout << "GFLOPs: " << result.gflops << std::endl; + std::cout << "GFLOPS: " << result.gflops << std::endl; std::cout << "Normalized L2 norm of" << std::endl; std::cout.precision(8); std::cout << std::scientific diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu index d1f7417f4d..d6e2a35fb8 100644 --- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu +++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu @@ -364,7 +364,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS"; return out; } @@ -674,7 +674,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/34_transposed_conv2d/34_transposed_conv2d.cu b/examples/34_transposed_conv2d/34_transposed_conv2d.cu index f3be99b9c9..f99eb4b30b 100644 --- a/examples/34_transposed_conv2d/34_transposed_conv2d.cu +++ b/examples/34_transposed_conv2d/34_transposed_conv2d.cu @@ -302,7 +302,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS"; return out; } @@ -574,7 +574,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/35_gemm_softmax/gemm_softmax.cu b/examples/35_gemm_softmax/gemm_softmax.cu index c07b1ea9a8..11aa20b370 100644 --- a/examples/35_gemm_softmax/gemm_softmax.cu +++ b/examples/35_gemm_softmax/gemm_softmax.cu @@ -675,7 +675,7 @@ struct Testbed { std::cout << " Runtime: " << elapsed_ms_per_iter << " ms\n" << std::endl; - std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl; + std::cout << " GFLOPS: " << gflops_per_second << " GFLOPS" << std::endl; std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl; return true; diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu index 016102c71b..ab28835b58 100644 --- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu +++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu @@ -489,7 +489,7 @@ int run(Options &options) { return -1; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -499,7 +499,7 @@ int run(Options &options) { } std::cout << "Runtime: " << result.runtime_ms << " ms\n"; - std::cout << " GFLOPs: " << result.gflops << "\n"; + std::cout << " GFLOPS: " << result.gflops << "\n"; return 0; } diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu index 95bda9903c..65c905757c 100644 --- a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu +++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu @@ -885,7 +885,7 @@ struct Testbed { << std::endl; std::cout << " Runtime / iteration: " << elapsed_ms_per_iter << " ms\n" << std::endl; - std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl; + std::cout << " GFLOPS: " << gflops_per_second << " GFLOPS" << std::endl; return true; } diff --git a/examples/38_syr2k_grouped/syr2k_grouped.cu b/examples/38_syr2k_grouped/syr2k_grouped.cu index 168f99e499..1951131a68 100644 --- a/examples/38_syr2k_grouped/syr2k_grouped.cu +++ b/examples/38_syr2k_grouped/syr2k_grouped.cu @@ -242,7 +242,7 @@ struct Options { output_file.open(output_path.c_str(), open_mode); if (output_file.good() && open_mode != std::ios_base::app) { - output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n"; + output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n"; } } @@ -994,7 +994,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1246,7 +1246,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1279,7 +1279,7 @@ public: std::cout << std::endl; std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "Grouped GFLOPS: " << result.gflops << std::endl; if (this->options.profile_initialization) { std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; } diff --git a/examples/39_gemm_permute/gemm_permute.cu b/examples/39_gemm_permute/gemm_permute.cu index 40540a1b4d..05014ddb7f 100644 --- a/examples/39_gemm_permute/gemm_permute.cu +++ b/examples/39_gemm_permute/gemm_permute.cu @@ -755,7 +755,7 @@ public: float runtime_total_ms = 0; CHECK_CUDA_CALL(cudaEventElapsedTime(&runtime_total_ms, events[0], events[1]), return false); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. double runtime_avg_ms = double(runtime_total_ms) / double(options.iterations); double gflops = options.gflops(runtime_avg_ms / 1000.0, kBatched); @@ -765,7 +765,7 @@ public: } std::cout << " Runtime: " << runtime_avg_ms << " ms\n" - " GFLOPs: " << gflops << std::endl; + " GFLOPS: " << gflops << std::endl; return true; } diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu index 5dad08d29e..1289b2d654 100644 --- a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu @@ -968,7 +968,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -988,7 +988,7 @@ public: << ", " << options.batch_size << "}." << std::endl; std::cout << std::endl; std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "GFLOPS: " << result.gflops << std::endl; return result; } diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu index 6fbc7bc0bf..f3124990fd 100644 --- a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu @@ -1027,7 +1027,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1048,7 +1048,7 @@ public: options.print_problems(); std::cout << std::endl; std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "GFLOPS: " << result.gflops << std::endl; return result; } diff --git a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu index 0e773ee591..a80ad35408 100644 --- a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu +++ b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu @@ -370,7 +370,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPS"; return out; } @@ -615,7 +615,7 @@ Result profile_convolution(Options const &options) 
{ return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu index 7627f737d6..0cbc631454 100644 --- a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu +++ b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu @@ -608,7 +608,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -626,7 +626,7 @@ public: std::cout << std::endl; std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << " GFLOPs: " << result.gflops << std::endl; + std::cout << " " << " GFLOPS: " << result.gflops << std::endl; return result; } diff --git a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu index 23b30285ba..5245cfa472 100644 --- a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu +++ b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu @@ -379,7 +379,7 @@ struct Result { out << "Name,"; } - out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPs"; + out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPS"; return out; } @@ -626,7 +626,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu index 1f4f4312b1..8df89615cc 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu @@ -63,29 +63,29 @@ Basic data-parallel GEMM Disposition: Passed Avg runtime: 0.112633 ms - GFLOPs: 152530 + GFLOPS: 152530 StreamK GEMM with default load-balancing Disposition: Passed Avg runtime: 0.0941929 ms - GFLOPs: 182390 + GFLOPS: 182390 Speedup vs Basic-DP: 1.196 StreamK emulating basic data-parallel GEMM Disposition: Passed Avg runtime: 0.113119 ms - GFLOPs: 151875 + GFLOPS: 151875 Speedup vs Basic-DP: 0.996 Basic split-K GEMM with tile-splitting factor 2 Disposition: Passed Avg runtime: 0.104772 ms - GFLOPs: 163973 + GFLOPS: 163973 StreamK emulating Split-K GEMM with tile-splitting factor 2 Disposition: Passed Avg runtime: 0.105379 ms - GFLOPs: 163029 + GFLOPS: 163029 Speedup vs Basic-SplitK: 0.994 **************************************************************************************************/ @@ -421,13 +421,13 @@ Result run(std::string description, Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; } if (!result.passed) { diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu index 1707f08240..f3a8b6454d 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu @@ -532,13 +532,13 @@ Result run(std::string description, Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPs: " << result.gflops << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; } // TODO: uncomment when results match diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu index 3a35cd7197..181d0e2f57 100644 --- a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu +++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu @@ -441,7 +441,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu index 5af4e67ba3..1e4e9f03b6 100644 --- a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu +++ b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu @@ -596,7 +596,7 @@ struct ExampleRunner std::cout << name << ":\n"; std::cout << " Runtime: " << runtime << " ms\n"; - std::cout << " GFLOPs: " << gflops << "\n"; + std::cout << " GFLOPS: " << gflops << "\n"; }; benchmark("Fused", [&](){ run_gemm(gemm); }); diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu index 2d2b719718..2069b13306 100644 --- a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu +++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu @@ -717,7 +717,7 @@ private: std::cout << name << ":\n"; std::cout << " Runtime: " << runtime << " ms\n"; - std::cout << " GFLOPs: " << gflops << "\n"; + std::cout << " GFLOPS: " << gflops << "\n"; }; benchmark("Fused GEMM+permute", [&](){ run_gemm(gemm_permute); }); diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu index 8f4b8758fd..c945d3f721 100644 --- a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu +++ b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu @@ -524,7 +524,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp index 98c7440679..dfe52b8e21 100644 --- a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp +++ b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp @@ -166,7 +166,7 @@ void mixed_dtype_profiling( cudaEventDestroy(start); cudaEventDestroy(stop); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size(); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu index b12e75ec42..42bf1473d2 100644 --- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu +++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu @@ -480,7 +480,7 @@ int run(Options &options) } timer.stop(); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu index eb449e8f0e..65b12e7758 100644 --- a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu +++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu @@ -710,7 +710,7 @@ int run(Options &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu index 9f60d077be..dfc3db4d94 100644 --- a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu +++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu @@ -738,13 +738,13 @@ struct TestbedRunner { return false; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
runtime_ms = runtime_ms / float(options.iterations); float gflops = options.gflops(runtime_ms / 1000.0f); std::cout << "Problem size: " << options.problem_size.m() << 'x' << options.problem_size.n() << 'x' << options.problem_size.k() << std::endl; std::cout << "Runtime (ms): " << runtime_ms << std::endl; - std::cout << "GFLOPs/sec: " << gflops << std::endl; + std::cout << "GFLOPS: " << gflops << std::endl; // Cleanup for (auto event : events) { diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu index 62da02c00a..932ed82c82 100644 --- a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu +++ b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu @@ -472,7 +472,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu index da057e2d5c..0058a86f05 100644 --- a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu +++ b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu @@ -502,7 +502,7 @@ struct Runner } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); double avg_runtime_ms = double(elapsed_ms) / double(options.iterations); double gflops = options.gflops(avg_runtime_ms / 1000.0); diff --git a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu index 9fcb9dee5d..66fa4c7adf 100644 --- a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu +++ b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu @@ -434,7 +434,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); double avg_runtime_s = (double)(result.avg_runtime_ms / 1000.0); diff --git a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu index 9bed32f5ac..3c9bc51b98 100644 --- a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu +++ b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu @@ -272,7 +272,7 @@ struct Options { output_file.open(output_path.c_str(), open_mode); if (output_file.good() && open_mode != std::ios_base::app) { - output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n"; + output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n"; } } @@ -1029,7 +1029,7 @@ public: return result; } - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
result.runtime_ms = double(runtime_ms) / double(this->options.iterations); result.gflops = this->options.gflops(result.runtime_ms / 1000.0); @@ -1062,7 +1062,7 @@ public: std::cout << std::endl; std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "Grouped GFLOPS: " << result.gflops << std::endl; if (this->options.profile_initialization) { std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; } diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index f080b6c69d..2553d36ab1 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -685,7 +685,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index 19e012b009..fd53a93026 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -720,7 +720,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu index 94eb55321f..7f48966ed7 100644 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu +++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu @@ -719,7 +719,7 @@ int run(OptionType &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu index b5419fe2a1..e1e42701d9 100644 --- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu +++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu @@ -723,7 +723,7 @@ int run(OptionType &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu index f9d1421542..93eb05cdb2 100644 --- a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu +++ b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu @@ -420,7 +420,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu index f0b85865ec..a18db03627 100644 --- a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu +++ b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu @@ -607,7 +607,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu index 390012f23f..7e2fe4deeb 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu @@ -477,7 +477,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu index e3ad25feea..6365dbc93a 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu @@ -532,7 +532,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu index e157c6ca75..3ec46e916b 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu @@ -478,7 +478,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu index df805051c6..fd8fbc6f62 100644 --- a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu +++ b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu @@ -480,7 +480,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu index 31e5c2e0a1..7ed33350a4 100644 --- a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu +++ b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu @@ -524,7 +524,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu index 84c42b9129..79f2f8c2dd 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu @@ -729,7 +729,7 @@ int run(Options &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu index a18828e229..ce8795eccd 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu @@ -863,7 +863,7 @@ int run(Options &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); diff --git a/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu b/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu index f548e89000..001bd4da41 100644 --- a/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu +++ b/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu @@ -468,7 +468,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu b/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu index 49da2af67f..36488cd920 100644 --- a/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu +++ b/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu @@ -468,7 +468,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. 
+ // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu b/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu index a491bed844..633ae52734 100644 --- a/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu +++ b/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu @@ -464,7 +464,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu index cd4231c043..e8bcd41052 100644 --- a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu +++ b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu @@ -409,7 +409,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu index f2f585b820..736a79c226 100644 --- a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu @@ -475,7 +475,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu index d929823bb0..1dfc4d3fe9 100644 --- a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu @@ -522,7 +522,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu index f50f14d768..842054dd16 100644 --- a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu @@ -475,7 +475,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu index d3ebecd163..3c21d05f5c 100644 --- a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu @@ -849,7 +849,7 @@ int run(Options &options, bool host_problem_shapes_available = true) } timer.stop(); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); diff --git a/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu b/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu index ee679f7040..8a413efaa5 100644 --- a/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu +++ b/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu @@ -497,7 +497,7 @@ int run(Options &options) CUTLASS_CHECK(gemm.run()); } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu b/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu index c19a094897..0e69b04398 100644 --- a/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu +++ b/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu @@ -521,7 +521,7 @@ int run(Options &options) CUTLASS_CHECK(gemm.run()); } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu index 10cfe89d3c..ea97da9e59 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu @@ -515,7 +515,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu index 6d8d1de019..19f36ae0c1 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu @@ -522,7 +522,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu index b43869e7f1..1acd317a74 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu @@ -689,7 +689,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. 
+ // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu index 60667cda29..9956b0e7fc 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu @@ -696,7 +696,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu b/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu index d428047219..db48f74477 100644 --- a/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu +++ b/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu @@ -543,7 +543,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu b/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu index 4f1b4f4990..3296826791 100644 --- a/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu +++ b/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu @@ -623,7 +623,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu b/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu index ae9224722b..e24a020f48 100644 --- a/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu +++ b/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu @@ -625,7 +625,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh b/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh index f26e6be824..366006b922 100644 --- a/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh +++ b/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh @@ -169,7 +169,7 @@ void mixed_dtype_profiling( cudaEventDestroy(start); cudaEventDestroy(stop); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size(); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu index 8a4360a94e..7f52183ebe 100644 --- a/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu +++ b/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu @@ -444,7 +444,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu index a90d24cc02..5286b21ee4 100644 --- a/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu +++ b/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu @@ -461,7 +461,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu index 467f814585..c4161b0f72 100644 --- a/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu +++ b/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu @@ -603,7 +603,7 @@ int run(Options &options) { } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu b/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu index d6d7879cad..dc0483f650 100644 --- a/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu +++ b/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu @@ -480,7 +480,7 @@ int run(Options &options) } timer.stop(); - // Compute average runtime and GFLOPs. + // Compute average runtime and GFLOPS. float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); diff --git a/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu b/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu index 18f592d2a5..4e8a23ef08 100644 --- a/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu +++ b/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu @@ -942,7 +942,7 @@ int run(Options &options, bool host_problem_shapes_available = true) // Free profiling workspace cudaFree(workspace); - // Compute average setup and runtime and GFLOPs. + // Compute average setup and runtime and GFLOPS. 
float elapsed_ms = timer.elapsed_millis(); result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); diff --git a/examples/91_fp4_gemv/91_fp4_gemv.cu b/examples/91_fp4_gemv/91_fp4_gemv.cu index 65fb2a0f1a..c53b3e59f0 100644 --- a/examples/91_fp4_gemv/91_fp4_gemv.cu +++ b/examples/91_fp4_gemv/91_fp4_gemv.cu @@ -537,7 +537,7 @@ struct TestbedGemvFp4SFDBase << ", batch size: " << gemm_batch << std::endl; std::cout << " Runtime: " << elapsed_ms_per_iter << " ms" << std::endl; - std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl; + std::cout << " GFLOPS: " << gflops_per_second << " GFLOPS" << std::endl; std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl; } diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h index 8e939f9710..770ede6e24 100755 --- a/test/unit/gemm/kernel/testbed_gemv.h +++ b/test/unit/gemm/kernel/testbed_gemv.h @@ -286,7 +286,7 @@ void batched_gemv_kernel_test(cutlass::gemm::BatchedGemmCoord problem_size, << " x " << problem_size.batch() << std::endl; - std::cout << " GFLOPs: " << gflops_per_sec << std::endl; + std::cout << " GFLOPS: " << gflops_per_sec << std::endl; std::cout << "BW (R/W): " << read_bandwidth << " / " << write_bandwidth << " GB/sec" << std::endl; std::cout << " Runtime: " << avg_runtime << " ms" << std::endl; }