diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
index b58896458a..4be04f37ef 100644
--- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
+++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
@@ -404,7 +404,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS";
 
     return out;
   }
@@ -662,7 +662,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu
index 4324e68038..636810735b 100644
--- a/examples/10_planar_complex/planar_complex.cu
+++ b/examples/10_planar_complex/planar_complex.cu
@@ -416,7 +416,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -476,7 +476,7 @@ public:
     }
 
     std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << " GFLOPs: " << result.gflops << std::endl;
+    std::cout << " GFLOPS: " << result.gflops << std::endl;
 
     return result;
   }
diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu
index aa5a8f02c8..503c6ccdd6 100644
--- a/examples/11_planar_complex_array/planar_complex_array.cu
+++ b/examples/11_planar_complex_array/planar_complex_array.cu
@@ -477,7 +477,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -537,7 +537,7 @@ public:
     }
 
     std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << " GFLOPs: " << result.gflops << std::endl;
+    std::cout << " GFLOPS: " << result.gflops << std::endl;
 
     return result;
   }
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
index 895b52b14d..b02e200651 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
@@ -369,7 +369,7 @@ int run(Options &options) {
     return -1;
   }
 
-  // Compute average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.runtime_ms = double(runtime_ms) / double(options.iterations);
   result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -412,7 +412,7 @@ int run(Options &options) {
 
   if (passed) {
     std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << " GFLOPs: " << result.gflops << std::endl;
+    std::cout << " GFLOPS: " << result.gflops << std::endl;
   }
 
   std::cout << (passed ? "Passed" : "Failed") << std::endl;
diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
index 91f6a4bbe0..b8804049ac 100644
--- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
+++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
@@ -499,7 +499,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS";
 
     return out;
   }
diff --git a/examples/21_quaternion_gemm/quaternion_gemm.cu b/examples/21_quaternion_gemm/quaternion_gemm.cu
index dd2476a4fe..e8a8325a39 100644
--- a/examples/21_quaternion_gemm/quaternion_gemm.cu
+++ b/examples/21_quaternion_gemm/quaternion_gemm.cu
@@ -378,7 +378,7 @@ int run(Options options) {
     return -1;
   }
 
-  // Compute average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.runtime_ms = double(runtime_ms) / double(options.iterations);
   result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -424,7 +424,7 @@ int run(Options options) {
 
   if (passed) {
     std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << " GFLOPs: " << result.gflops << std::endl;
+    std::cout << " GFLOPS: " << result.gflops << std::endl;
   }
 
   std::cout << (passed ? "Passed" : "Failed") << std::endl;
diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu
index 170f978b88..7f68538569 100644
--- a/examples/22_quaternion_conv/quaternion_conv.cu
+++ b/examples/22_quaternion_conv/quaternion_conv.cu
@@ -321,7 +321,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS";
 
     return out;
   }
@@ -577,7 +577,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
index d0b0aa0688..42d3876e07 100644
--- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
+++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
@@ -678,7 +678,7 @@ Result profile(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
 
     // Cleanup
diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu
index 9dbe03b1aa..3b0112c337 100644
--- a/examples/24_gemm_grouped/gemm_grouped.cu
+++ b/examples/24_gemm_grouped/gemm_grouped.cu
@@ -268,7 +268,7 @@ struct Options {
       output_file.open(output_path.c_str(), open_mode);
 
       if (output_file.good() && open_mode != std::ios_base::app) {
-        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n";
+        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n";
       }
     }
 
@@ -1118,7 +1118,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1139,7 +1139,7 @@ public:
     std::cout << "    " << this->options.problem_bins.size() << " batched GEMMs launched" << std::endl;
     std::cout << std::endl;
     std::cout << "    " << "Batched Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "Batched  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "Batched  GFLOPS: " << result.gflops << std::endl;
 
     std::string provider = "CUTLASS";
 
@@ -1380,7 +1380,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1413,7 +1413,7 @@ public:
 
     std::cout << std::endl;
     std::cout << "    " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "Grouped  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "Grouped  GFLOPS: " << result.gflops << std::endl;
     if (this->options.profile_initialization) {
       std::cout << "    " << "Init    Runtime: " << result.initialization_time_ms << " ms" << std::endl;
     }
diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
index 285a0afab7..b8597b64dc 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
+++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
@@ -347,7 +347,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,D,H,W,C,K,T,R,S,Stride_D,Stride_H,Stride_W,Runtime,GFLOPs";
+    out << "Layer,N,D,H,W,C,K,T,R,S,Stride_D,Stride_H,Stride_W,Runtime,GFLOPS";
 
     return out;
   }
@@ -659,7 +659,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
index db33764624..5c6ad4f25e 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
+++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
@@ -340,7 +340,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS";
 
     return out;
   }
@@ -651,7 +651,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
index e983e8550c..391c195df9 100644
--- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
+++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
@@ -337,7 +337,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS";
 
     return out;
   }
@@ -649,7 +649,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
index e30718fefc..3249dd371a 100644
--- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
+++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
@@ -532,7 +532,7 @@ bool run(Options &options) {
     return false;
   }
 
-  // Compute average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.m = problem_size.m();
   result.n = problem_size.n();
   result.k = problem_size.k();
@@ -661,7 +661,7 @@ bool run(Options &options) {
   std::cout.precision(4);
   std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
   std::cout.precision(2);
-  std::cout << "GFLOPs: " << result.gflops << std::endl;
+  std::cout << "GFLOPS: " << result.gflops << std::endl;
   std::cout << "Normalized L2 norm of" << std::endl;
   std::cout.precision(8);
   std::cout << std::scientific
diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
index 7eec75e5c0..6f9d3f3f12 100644
--- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
+++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
@@ -333,7 +333,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs,3xTF32_vs_FP64,1xTF32_vs_FP64,FP32_vs_FP64";
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPS,3xTF32_vs_FP64,1xTF32_vs_FP64,FP32_vs_FP64";
 
     return out;
   }
@@ -559,7 +559,7 @@ Result profile_convolution(Options const &options) {
     return result;
   }
 
-  // Print average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.runtime_ms = double(runtime_ms) / double(options.iterations);
   result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
index 2b7d7bef2e..6e979a969a 100644
--- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
+++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
@@ -472,7 +472,7 @@ bool run(Options &options) {
     return false;
   }
 
-  // Compute average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.m = problem_size.m();
   result.n = problem_size.n();
   result.k = problem_size.k();
@@ -603,7 +603,7 @@ bool run(Options &options) {
   std::cout.precision(4);
   std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
   std::cout.precision(2);
-  std::cout << "GFLOPs: " << result.gflops << std::endl;
+  std::cout << "GFLOPS: " << result.gflops << std::endl;
   std::cout << "Normalized L2 norm of" << std::endl;
   std::cout.precision(8);
   std::cout << std::scientific
diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
index d1f7417f4d..d6e2a35fb8 100644
--- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu
+++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
@@ -364,7 +364,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS";
 
     return out;
   }
@@ -674,7 +674,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/34_transposed_conv2d/34_transposed_conv2d.cu b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
index f3be99b9c9..f99eb4b30b 100644
--- a/examples/34_transposed_conv2d/34_transposed_conv2d.cu
+++ b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
@@ -302,7 +302,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,Stride_H,Stride_W,Runtime,GFLOPS";
 
     return out;
   }
@@ -574,7 +574,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/35_gemm_softmax/gemm_softmax.cu b/examples/35_gemm_softmax/gemm_softmax.cu
index c07b1ea9a8..11aa20b370 100644
--- a/examples/35_gemm_softmax/gemm_softmax.cu
+++ b/examples/35_gemm_softmax/gemm_softmax.cu
@@ -675,7 +675,7 @@ struct Testbed {
 
     std::cout << "         Runtime: " << elapsed_ms_per_iter << " ms\n" << std::endl;
 
-    std::cout << "          GFLOPs: " << gflops_per_second << "  GFLOPs" << std::endl;
+    std::cout << "          GFLOPS: " << gflops_per_second << "  GFLOPS" << std::endl;
     std::cout << "Memory bandwidth: " << gbytes_per_second << "  GiB/s" << std::endl;
 
     return true;
diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
index 016102c71b..ab28835b58 100644
--- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
+++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
@@ -489,7 +489,7 @@ int run(Options &options) {
     return -1;
   }
 
-  // Compute average runtime and GFLOPs.
+  // Compute average runtime and GFLOPS.
   result.runtime_ms = double(runtime_ms) / double(options.iterations);
   result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -499,7 +499,7 @@ int run(Options &options) {
   }
 
   std::cout << "Runtime: " << result.runtime_ms << " ms\n";
-  std::cout << " GFLOPs: " << result.gflops << "\n";
+  std::cout << " GFLOPS: " << result.gflops << "\n";
 
   return 0;
 }
diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
index 95bda9903c..65c905757c 100644
--- a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
+++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
@@ -885,7 +885,7 @@ struct Testbed {
               << std::endl;
 
     std::cout << " Runtime / iteration: " << elapsed_ms_per_iter << " ms\n" << std::endl;
-    std::cout << "              GFLOPs: " << gflops_per_second << "  GFLOPs" << std::endl;
+    std::cout << "              GFLOPS: " << gflops_per_second << "  GFLOPS" << std::endl;
 
     return true;
   }
diff --git a/examples/38_syr2k_grouped/syr2k_grouped.cu b/examples/38_syr2k_grouped/syr2k_grouped.cu
index 168f99e499..1951131a68 100644
--- a/examples/38_syr2k_grouped/syr2k_grouped.cu
+++ b/examples/38_syr2k_grouped/syr2k_grouped.cu
@@ -242,7 +242,7 @@ struct Options {
       output_file.open(output_path.c_str(), open_mode);
 
       if (output_file.good() && open_mode != std::ios_base::app) {
-        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n";
+        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n";
       }
     }
 
@@ -994,7 +994,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1246,7 +1246,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1279,7 +1279,7 @@ public:
 
     std::cout << std::endl;
     std::cout << "    " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "Grouped  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "Grouped  GFLOPS: " << result.gflops << std::endl;
     if (this->options.profile_initialization) {
       std::cout << "    " << "Init    Runtime: " << result.initialization_time_ms << " ms" << std::endl;
     }
diff --git a/examples/39_gemm_permute/gemm_permute.cu b/examples/39_gemm_permute/gemm_permute.cu
index 40540a1b4d..05014ddb7f 100644
--- a/examples/39_gemm_permute/gemm_permute.cu
+++ b/examples/39_gemm_permute/gemm_permute.cu
@@ -755,7 +755,7 @@ public:
     float runtime_total_ms = 0;
     CHECK_CUDA_CALL(cudaEventElapsedTime(&runtime_total_ms, events[0], events[1]), return false);
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     double runtime_avg_ms = double(runtime_total_ms) / double(options.iterations);
     double gflops = options.gflops(runtime_avg_ms / 1000.0, kBatched);
 
@@ -765,7 +765,7 @@ public:
     }
 
     std::cout << "    Runtime: " << runtime_avg_ms << " ms\n"
-                 "     GFLOPs: " << gflops << std::endl;
+                 "     GFLOPS: " << gflops << std::endl;
 
     return true;
   }
diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
index 5dad08d29e..1289b2d654 100644
--- a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
+++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
@@ -968,7 +968,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -988,7 +988,7 @@ public:
       << ", " << options.batch_size << "}." << std::endl;
     std::cout << std::endl;
     std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "GFLOPS: " << result.gflops << std::endl;
 
     return result;
   }
diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
index 6fbc7bc0bf..f3124990fd 100644
--- a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
+++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
@@ -1027,7 +1027,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1048,7 +1048,7 @@ public:
     options.print_problems();
     std::cout << std::endl;
     std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "GFLOPS: " << result.gflops << std::endl;
 
     return result;
   }
diff --git a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
index 0e773ee591..a80ad35408 100644
--- a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
+++ b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
@@ -370,7 +370,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPS";
 
     return out;
   }
@@ -615,7 +615,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
index 7627f737d6..0cbc631454 100644
--- a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
+++ b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
@@ -608,7 +608,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
@@ -626,7 +626,7 @@ public:
 
     std::cout << std::endl;
     std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << " GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << " GFLOPS: " << result.gflops << std::endl;
 
     return result;
   }
diff --git a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
index 23b30285ba..5245cfa472 100644
--- a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
+++ b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
@@ -379,7 +379,7 @@ struct Result {
       out << "Name,";
     }
 
-    out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPs";
+    out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPS";
 
     return out;
   }
@@ -626,7 +626,7 @@ Result profile_convolution(Options const &options) {
       return result;
     }
 
-    // Print average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(options.iterations);
     result.gflops = options.gflops(result.runtime_ms / 1000.0);
 
diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
index 1f4f4312b1..8df89615cc 100644
--- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
+++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
@@ -63,29 +63,29 @@
         Basic data-parallel GEMM
           Disposition: Passed
           Avg runtime: 0.112633 ms
-          GFLOPs: 152530
+          GFLOPS: 152530
 
         StreamK GEMM with default load-balancing
           Disposition: Passed
           Avg runtime: 0.0941929 ms
-          GFLOPs: 182390
+          GFLOPS: 182390
           Speedup vs Basic-DP: 1.196
 
         StreamK emulating basic data-parallel GEMM
           Disposition: Passed
           Avg runtime: 0.113119 ms
-          GFLOPs: 151875
+          GFLOPS: 151875
           Speedup vs Basic-DP: 0.996
 
         Basic split-K GEMM with tile-splitting factor 2
           Disposition: Passed
           Avg runtime: 0.104772 ms
-          GFLOPs: 163973
+          GFLOPS: 163973
 
         StreamK emulating Split-K GEMM with tile-splitting factor 2
           Disposition: Passed
           Avg runtime: 0.105379 ms
-          GFLOPs: 163029
+          GFLOPS: 163029
           Speedup vs Basic-SplitK: 0.994
 
  **************************************************************************************************/
@@ -421,13 +421,13 @@ Result run(std::string description, Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
 
     std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
-    std::cout << "  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "  GFLOPS: " << result.gflops << std::endl;
   }
 
   if (!result.passed) {
diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
index 1707f08240..f3a8b6454d 100644
--- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
+++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
@@ -532,13 +532,13 @@ Result run(std::string description, Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
 
     std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
-    std::cout << "  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "  GFLOPS: " << result.gflops << std::endl;
   }
 
   // TODO: uncomment when results match
diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
index 3a35cd7197..181d0e2f57 100644
--- a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
+++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
@@ -441,7 +441,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
index 5af4e67ba3..1e4e9f03b6 100644
--- a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
+++ b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
@@ -596,7 +596,7 @@ struct ExampleRunner
 
       std::cout << name << ":\n";
       std::cout << "  Runtime: " << runtime << " ms\n";
-      std::cout << "   GFLOPs: " << gflops  << "\n";
+      std::cout << "   GFLOPS: " << gflops  << "\n";
     };
 
     benchmark("Fused", [&](){ run_gemm(gemm); });
diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
index 2d2b719718..2069b13306 100644
--- a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
+++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
@@ -717,7 +717,7 @@ private:
 
       std::cout << name << ":\n";
       std::cout << "  Runtime: " << runtime << " ms\n";
-      std::cout << "   GFLOPs: " << gflops  << "\n";
+      std::cout << "   GFLOPS: " << gflops  << "\n";
     };
 
     benchmark("Fused GEMM+permute", [&](){ run_gemm<false>(gemm_permute); });
diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
index 8f4b8758fd..c945d3f721 100644
--- a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
+++ b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
@@ -524,7 +524,7 @@ int run(Options<RasterOrderOptions> &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
index 98c7440679..dfe52b8e21 100644
--- a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
+++ b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
@@ -166,7 +166,7 @@ void mixed_dtype_profiling(
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
 
-  // Compute average setup and runtime and GFLOPs.
+  // Compute average setup and runtime and GFLOPS.
   result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
   result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
 
diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
index b12e75ec42..42bf1473d2 100644
--- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
+++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
@@ -480,7 +480,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime, and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
index eb449e8f0e..65b12e7758 100644
--- a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
+++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
@@ -710,7 +710,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime, and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);
diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
index 9f60d077be..dfc3db4d94 100644
--- a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
+++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
@@ -738,13 +738,13 @@ struct TestbedRunner {
       return false;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     runtime_ms = runtime_ms / float(options.iterations);
     float gflops = options.gflops(runtime_ms / 1000.0f);
 
     std::cout << "Problem size: " << options.problem_size.m() << 'x' << options.problem_size.n() << 'x' << options.problem_size.k() << std::endl;
     std::cout << "Runtime (ms): " << runtime_ms << std::endl;
-    std::cout << "GFLOPs/sec:   " << gflops << std::endl;
+    std::cout << "GFLOPS/sec:   " << gflops << std::endl;
 
     // Cleanup
     for (auto event : events) {
diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
index 62da02c00a..932ed82c82 100644
--- a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
+++ b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
@@ -472,7 +472,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
index da057e2d5c..0058a86f05 100644
--- a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
+++ b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
@@ -502,7 +502,7 @@ struct Runner
       }
       timer.stop();
 
-      // Compute average runtime and GFLOPs.
+      // Compute average runtime and GFLOPS.
       float elapsed_ms = timer.elapsed_millis();
       double avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
       double gflops = options.gflops(avg_runtime_ms / 1000.0);
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
index 9fcb9dee5d..66fa4c7adf 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
+++ b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
@@ -434,7 +434,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     double avg_runtime_s = (double)(result.avg_runtime_ms / 1000.0);
diff --git a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
index 9bed32f5ac..3c9bc51b98 100644
--- a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
+++ b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
@@ -272,7 +272,7 @@ struct Options {
       output_file.open(output_path.c_str(), open_mode);
 
       if (output_file.good() && open_mode != std::ios_base::app) {
-        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n";
+        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPS\n";
       }
     }
 
@@ -1029,7 +1029,7 @@ public:
       return result;
     }
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
     result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
 
@@ -1062,7 +1062,7 @@ public:
 
     std::cout << std::endl;
     std::cout << "    " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl;
-    std::cout << "    " << "Grouped  GFLOPs: " << result.gflops << std::endl;
+    std::cout << "    " << "Grouped  GFLOPS: " << result.gflops << std::endl;
     if (this->options.profile_initialization) {
       std::cout << "    " << "Init    Runtime: " << result.initialization_time_ms << " ms" << std::endl;
     }
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
index f080b6c69d..2553d36ab1 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
@@ -685,7 +685,7 @@ int run(Options<RasterOrderOptions> &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
index 19e012b009..fd53a93026 100644
--- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu
@@ -720,7 +720,7 @@ int run(Options<RasterOrderOptions> &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu
index 94eb55321f..7f48966ed7 100644
--- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu
+++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling.cu
@@ -719,7 +719,7 @@ int run(OptionType &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu
index b5419fe2a1..e1e42701d9 100644
--- a/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu
+++ b/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling_with_sparse_groups.cu
@@ -723,7 +723,7 @@ int run(OptionType &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
index f9d1421542..93eb05cdb2 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu
@@ -420,7 +420,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
index f0b85865ec..a18db03627 100644
--- a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
+++ b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu
@@ -607,7 +607,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
index 390012f23f..7e2fe4deeb 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu
@@ -477,7 +477,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
index e3ad25feea..6365dbc93a 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu
@@ -532,7 +532,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
index e157c6ca75..3ec46e916b 100644
--- a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
+++ b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu
@@ -478,7 +478,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
index df805051c6..fd8fbc6f62 100644
--- a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
+++ b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu
@@ -480,7 +480,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu
index 31e5c2e0a1..7ed33350a4 100644
--- a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu
+++ b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu
@@ -524,7 +524,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
index 84c42b9129..79f2f8c2dd 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu
@@ -729,7 +729,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime, and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);
diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
index a18828e229..ce8795eccd 100644
--- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
+++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu
@@ -863,7 +863,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime, and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);
diff --git a/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu b/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu
index f548e89000..001bd4da41 100644
--- a/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu
+++ b/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu
@@ -468,7 +468,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu b/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu
index 49da2af67f..36488cd920 100644
--- a/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu
+++ b/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu
@@ -468,7 +468,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu b/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu
index a491bed844..633ae52734 100644
--- a/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu
+++ b/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu
@@ -464,7 +464,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu
index cd4231c043..e8bcd41052 100644
--- a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu
+++ b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu
@@ -409,7 +409,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
index f2f585b820..736a79c226 100644
--- a/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
+++ b/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu
@@ -475,7 +475,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
index d929823bb0..1dfc4d3fe9 100644
--- a/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
+++ b/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu
@@ -522,7 +522,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
index f50f14d768..842054dd16 100644
--- a/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
+++ b/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu
@@ -475,7 +475,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu
index d3ebecd163..3c21d05f5c 100644
--- a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu
+++ b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu
@@ -849,7 +849,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
     }
     timer.stop();
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime, and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);
diff --git a/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu b/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu
index ee679f7040..8a413efaa5 100644
--- a/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu
+++ b/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu
@@ -497,7 +497,7 @@ int run(Options &options)
       CUTLASS_CHECK(gemm.run());
     }
     timer.stop();
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu b/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu
index c19a094897..0e69b04398 100644
--- a/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu
+++ b/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu
@@ -521,7 +521,7 @@ int run(Options &options)
       CUTLASS_CHECK(gemm.run());
     }
     timer.stop();
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
index 10cfe89d3c..ea97da9e59 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu
@@ -515,7 +515,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu
index 6d8d1de019..19f36ae0c1 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu
@@ -522,7 +522,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu
index b43869e7f1..1acd317a74 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu
@@ -689,7 +689,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu
index 60667cda29..9956b0e7fc 100644
--- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu
+++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu
@@ -696,7 +696,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu b/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu
index d428047219..db48f74477 100644
--- a/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu
+++ b/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu
@@ -543,7 +543,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu b/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu
index 4f1b4f4990..3296826791 100644
--- a/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu
+++ b/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu
@@ -623,7 +623,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu b/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu
index ae9224722b..e24a020f48 100644
--- a/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu
+++ b/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu
@@ -625,7 +625,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh b/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh
index f26e6be824..366006b922 100644
--- a/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh
+++ b/examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh
@@ -169,7 +169,7 @@ void mixed_dtype_profiling(
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
 
-  // Compute average setup and runtime and GFLOPs.
+  // Compute average setup and runtime, and GFLOPS.
   result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
   result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
 
diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
index 8a4360a94e..7f52183ebe 100644
--- a/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
+++ b/examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
@@ -444,7 +444,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
index a90d24cc02..5286b21ee4 100644
--- a/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
+++ b/examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
@@ -461,7 +461,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu b/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
index 467f814585..c4161b0f72 100644
--- a/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
+++ b/examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
@@ -603,7 +603,7 @@ int run(Options &options) {
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu b/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu
index d6d7879cad..dc0483f650 100644
--- a/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu
+++ b/examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu
@@ -480,7 +480,7 @@ int run(Options &options)
     }
     timer.stop();
 
-    // Compute average runtime and GFLOPs.
+    // Compute average runtime and GFLOPS.
     float elapsed_ms = timer.elapsed_millis();
     result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
     result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
diff --git a/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu b/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu
index 18f592d2a5..4e8a23ef08 100644
--- a/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu
+++ b/examples/90_sm103_fp4_ultra_grouped_gemm/90_sm103_fp4_ultra_grouped_gemm.cu
@@ -942,7 +942,7 @@ int run(Options &options, bool host_problem_shapes_available = true)
     // Free profiling workspace
     cudaFree(workspace);
 
-    // Compute average setup and runtime and GFLOPs.
+    // Compute average setup and runtime and GFLOPS.
     float elapsed_ms       = timer.elapsed_millis();
     result.avg_runtime_ms  = double(elapsed_ms) / double(options.iterations);
     result.gflops          = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);
diff --git a/examples/91_fp4_gemv/91_fp4_gemv.cu b/examples/91_fp4_gemv/91_fp4_gemv.cu
index 65fb2a0f1a..c53b3e59f0 100644
--- a/examples/91_fp4_gemv/91_fp4_gemv.cu
+++ b/examples/91_fp4_gemv/91_fp4_gemv.cu
@@ -537,7 +537,7 @@ struct TestbedGemvFp4SFDBase
                 << ", batch size: " << gemm_batch
                 << std::endl;
       std::cout << "         Runtime: " << elapsed_ms_per_iter << " ms" << std::endl;
-      std::cout << "          GFLOPs: " << gflops_per_second << "  GFLOPs" << std::endl;
+      std::cout << "          GFLOPS: " << gflops_per_second << "  GFLOPS" << std::endl;
       std::cout << "Memory bandwidth: " << gbytes_per_second << "  GiB/s" << std::endl;
 
     }
diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h
index 8e939f9710..770ede6e24 100755
--- a/test/unit/gemm/kernel/testbed_gemv.h
+++ b/test/unit/gemm/kernel/testbed_gemv.h
@@ -286,7 +286,7 @@ void batched_gemv_kernel_test(cutlass::gemm::BatchedGemmCoord problem_size,
                   << " x " << problem_size.batch() 
                   << std::endl;
 
-        std::cout << "  GFLOPs:     " << gflops_per_sec << std::endl;
+        std::cout << "  GFLOPS:     " << gflops_per_sec << std::endl;
         std::cout << "BW (R/W):     " << read_bandwidth << " / " << write_bandwidth << " GB/sec" << std::endl;
         std::cout << " Runtime:     " << avg_runtime << " ms" << std::endl;
     }