From 216c8ba5ae2b9bac34f87f38a63cecb19727b7a4 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 3 Sep 2025 15:23:36 +0800 Subject: [PATCH] fix typos in examples --- examples/00_basic_gemm/basic_gemm.cu | 2 +- examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu | 2 +- examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu | 2 +- .../turing_tensorop_conv2dfprop.cu | 2 +- examples/13_two_tensor_op_fusion/b2b_conv2d_run.h | 2 +- examples/13_two_tensor_op_fusion/b2b_gemm_run.h | 2 +- examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h | 2 +- examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h | 2 +- examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h | 2 +- examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h | 2 +- .../ampere_fp64_tensorop_affine2_gemm.cu | 2 +- examples/39_gemm_permute/layouts.h | 2 +- examples/41_fused_multi_head_attention/gemm/custom_mma_base.h | 2 +- examples/41_fused_multi_head_attention/gemm/mma_from_smem.h | 2 +- .../iterators/epilogue_predicated_tile_iterator.h | 2 +- examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py | 2 +- examples/45_dual_gemm/threadblock/dual_mma_base.h | 2 +- .../49_collective_builder.cu | 2 +- examples/51_hopper_gett/51_hopper_gett.cu | 2 +- examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu | 2 +- examples/59_ampere_gather_scatter_conv/README.md | 4 ++-- ...hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu | 4 ++-- ...hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu | 4 ++-- examples/77_blackwell_fmha/README.md | 2 +- .../sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp | 2 +- .../collective/sm100_fmha_load_tma_warpspecialized.hpp | 4 ++-- .../collective/sm100_fmha_mla_load_tma_warpspecialized.hpp | 4 ++-- examples/77_blackwell_fmha/common/pipeline_mla.hpp | 2 +- .../kernel/sm100_fmha_mla_tma_warpspecialized.hpp | 2 +- examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp | 2 +- .../81_blackwell_gemm_blockwise.cu | 4 ++-- .../81_blackwell_gemm_groupwise.cu | 4 ++-- .../81_blackwell_grouped_gemm_blockwise.cu | 4 ++-- .../81_blackwell_grouped_gemm_groupwise.cu | 4 ++-- examples/81_blackwell_gemm_blockwise/README.md | 4 ++-- examples/87_blackwell_geforce_gemm_blockwise/utils.h | 2 +- examples/README.md | 2 +- examples/cute/tutorial/tiled_copy_if.cu | 4 ++-- examples/python/CuTeDSL/ampere/smem_allocator.py | 2 +- examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py | 2 +- examples/python/CuTeDSL/hopper/dense_gemm.py | 2 +- examples/python/deprecated/00_basic_gemm.ipynb | 4 ++-- examples/python/deprecated/01_epilogue.ipynb | 2 +- examples/python/deprecated/03_basic_conv2d.ipynb | 2 +- examples/python/deprecated/04_epilogue_visitor.ipynb | 4 ++-- 45 files changed, 58 insertions(+), 58 deletions(-) diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index df8009e0ea..5d1c0c15ea 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -47,7 +47,7 @@ or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are prevalent in the CUTLASS unit tests. - This example has delibrately been kept similar to the basic_gemm example from cutlass-1.3 to + This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to highlight the minimum amount of differences needed to transition to cutlass-2.0. 
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index 8bad0bbdaa..37cbc1a368 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. Setting up computation properties : describes how the above set matrices will be used to compute diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 70afef7e59..da157470fd 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. Setting up computation properties : describes how the above set matrices will be used to compute diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index b58896458a..8d510ebb5c 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -47,7 +47,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. 
Setting up computation properties : describes how the above set tensors will be used to compute diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h index df4cb76ad1..e9aac97abf 100644 --- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h @@ -560,7 +560,7 @@ class B2bFusedConv2dRun { if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n" << " problem_size_0.K = problem_size_1.C\n" << " problem_size_1.R = problem_size_1.S = 1\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h index f0e85cda3a..6391f41dd5 100644 --- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h @@ -604,7 +604,7 @@ struct B2bFusedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h index b6267a153b..ed88eb67cb 100644 --- a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h @@ -302,7 +302,7 @@ struct B2bFusedGroupedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h index 4693e86423..8221273869 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h @@ -586,7 +586,7 @@ class B2bInterleavedFusedConv2dRun { if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n" << " problem_size_0.K = problem_size_1.C\n" << " problem_size_1.R = problem_size_1.S = 1\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h index 453f44cd0c..a8b627a037 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h @@ -638,7 +638,7 @@ struct B2bInterleavedFusedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h index c845f2023f..139fa90a9e 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h @@ -97,7 +97,7 @@ class B2bMmaBase { Shape1::kN / WarpGemm1::kN, Shape1::kK / WarpGemm1::kK>; - /// Number of warp-level 
GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations0 = (WarpGemm0::kK / Operator0::Policy::MmaShape::kK); static int const kWarpGemmIterations1 = diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu index 8e0094f69a..90f5fe7099 100644 --- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu +++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu @@ -41,7 +41,7 @@ change to method to visit the global memory: address addr = base_pointer + coord1 * stride1 + coord2 * stride2 -The rest part of GEMM which includes shared memory load/store, mma comutation +The rest of the GEMM, which includes shared memory load/store and mma computation, is the same. This example uses Ampere fp64 tensore core Affine2 GEMM as an example. SIMT diff --git a/examples/39_gemm_permute/layouts.h b/examples/39_gemm_permute/layouts.h index d4d9ed3166..7b277bc259 100644 --- a/examples/39_gemm_permute/layouts.h +++ b/examples/39_gemm_permute/layouts.h @@ -71,7 +71,7 @@ class TensorCWHN { // Data members // - /// Stride data member - [n, hn, whn] + /// Stride data member - [n, hn, whn] Stride stride_; public: diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h index 66c099d15b..f3c26fb986 100644 --- a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h @@ -87,7 +87,7 @@ class CustomMmaBase { Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); diff --git a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h index f2b94d0031..eb1917d206 100644 --- a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h +++ b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h @@ -167,7 +167,7 @@ class MmaBaseFromSharedMemory { Shape::kK / WarpGemm::kK>; using WarpCount1 = WarpCount; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static int const kWarpGemmIterations1 = kWarpGemmIterations; diff --git a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h index 7a52e96a36..b3e9e1a2fb 100644 --- a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h +++ b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h @@ -67,7 +67,7 @@ namespace threadblock { /// ForwardTileIterator /// template < - typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap) typename Element_, ///< Element data type bool ScatterD = false, ///< Scatter D operand or not bool UseCUDAStore = false> diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py index 6cd01ef16b..081aaaca75 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py +++ 
b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py @@ -45,7 +45,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep self.b2b_num = len(fuse_gemm_info) self.user_header_file = user_header_file self.args = {} - # device arg struct memebr + # device arg struct member self.arg_member = [] self.gen_class_name = gen_class_name self.gen_kernel_name = gen_class_name + "Kernel" diff --git a/examples/45_dual_gemm/threadblock/dual_mma_base.h b/examples/45_dual_gemm/threadblock/dual_mma_base.h index 754719033e..8aee22abb5 100644 --- a/examples/45_dual_gemm/threadblock/dual_mma_base.h +++ b/examples/45_dual_gemm/threadblock/dual_mma_base.h @@ -91,7 +91,7 @@ class DualMmaBase { Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator0::Policy::MmaShape::kK); diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu index d5758aa219..98dd043a3f 100644 --- a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu +++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu @@ -238,7 +238,7 @@ bool initialize_block( // to select an appropriate value on its own. The CollectiveBuilder will attempt to select // configurations that will result in the most-performant kernel, but this is not a guarantee. // -// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatiblity. +// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatibility. // For example, if `KernelScheduleAuto` is used for the mainloop builder, `EpilogueScheduleAuto` must // be used for the epilogue builder. // diff --git a/examples/51_hopper_gett/51_hopper_gett.cu b/examples/51_hopper_gett/51_hopper_gett.cu index f2eb5c0bd3..c0df9029fe 100644 --- a/examples/51_hopper_gett/51_hopper_gett.cu +++ b/examples/51_hopper_gett/51_hopper_gett.cu @@ -58,7 +58,7 @@ In the following example, we illustrate how every Hopper GEMM in CUTLASS 3.0 is a GETT in disguise. We begin by defining the four modes detailed above as Row, Col (column), Red (reduction), and Bat (batch) strides, which we then nest for each of the in/out tensors to create our rank-3 stride - tuples. Note that although we do not define the problem shape type explicitely, it too remains a + tuples. Note that although we do not define the problem shape type explicitly, it too remains a rank-4 shape tuple just like any other batched GEMM, but instead with multi-mode shapes for each of the four corresponding multi-modes within it. After this, the same CollectiveMma and CollectiveBuilder we describe in examples 50 and 49 are used to create our kernel type. 
Nothing diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu index 2d2b719718..1ac54109b8 100644 --- a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu +++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu @@ -180,7 +180,7 @@ struct Options { " --alpha= GEMM alpha parameter\n" " --beta= GEMM beta parameter\n" " --iterations= Number of profiling iterations to perform.\n" - " --check= Validate results against a reference (unfused) imlementation" + " --check= Validate results against a reference (unfused) implementation" " --verbose= Enable verbose output" "\n" "Examples:\n" diff --git a/examples/59_ampere_gather_scatter_conv/README.md b/examples/59_ampere_gather_scatter_conv/README.md index b16ddf9572..1e5ce8f9c0 100644 --- a/examples/59_ampere_gather_scatter_conv/README.md +++ b/examples/59_ampere_gather_scatter_conv/README.md @@ -85,8 +85,8 @@ Now that we have transformed our problem in such a way that allows us to dispatc we can reuse much of the machinery CUTLASS offers to implement this forward pass convolution operator. CUTLASS decomposes these "moving parts" of GPU linear algebra into reusable, modular software components abstracted by C++ template classes. This example -demonstrates how some of the lower layers of the hierarchy can be re-used for custom kernels -by writing a custom kernel for convolution that re-uses the Ampere/Ada GEMM collectives +demonstrates how some of the lower layers of the hierarchy can be reused for custom kernels +by writing a custom kernel for convolution that reuses the Ampere/Ada GEMM collectives from CUTLASS 3. A kernel author is free to compose their custom components with any of the existing templates diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index 9e55755bc8..b184da4dba 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -302,7 +302,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -340,7 +340,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index ab88f54d9d..21f04fef68 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -312,7 +312,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - 
throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -350,7 +350,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/77_blackwell_fmha/README.md b/examples/77_blackwell_fmha/README.md index 1e28929b5b..506d4f5865 100644 --- a/examples/77_blackwell_fmha/README.md +++ b/examples/77_blackwell_fmha/README.md @@ -8,7 +8,7 @@ For generation usage, use an M-blocking (Num-Groups) of 128 (although the limit Context loads are done via TMA, whereas generation usage utilized `cp.async` and is thus more amenable to complex load patterns. -For variable sequence lenght, the code requires a batch of valid (but never used) padding memory ahead of the first input batch. This is achieved with least overhead by leaving one batch free and then arranging QKV consecutively. +For variable sequence length, the code requires a batch of valid (but never used) padding memory ahead of the first input batch. This is achieved with least overhead by leaving one batch free and then arranging QKV consecutively. The approach of this implementation is to reuse the selection logic of the collective gemm builder and recombine the result into an FMHA kernel. The kernel and collective layer are then formulated to be fmha-specific. diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp index 616357cb0e..008cfd1ca8 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp @@ -118,7 +118,7 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized { auto cumulative_length_q = get<0>(problem_shape).cumulative_length; if (cumulative_length_q != nullptr) { int max_length_q = get<0>(problem_shape).max_length; - // for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dO) = get<0>(dO); get<2,1>(problem_shape_O) = max_length_q * (1 + get<2,1>(problem_shape_O)); // offset ptr by the amount we add back in later diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_load_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_load_tma_warpspecialized.hpp index 1951056b2c..514a2c0d35 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_load_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_load_tma_warpspecialized.hpp @@ -101,7 +101,7 @@ struct Sm100FmhaLoadTmaWarpspecialized { auto cumulative_length_q = get<0>(problem_shape).cumulative_length; if (cumulative_length_q != nullptr) { int max_length_q = get<0>(problem_shape).max_length; - // for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dQ) = get<0>(dQ); get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_q * (1 + get<3,1>(problem_shape))); // offset ptr by the amount we add back in later @@ -113,7 +113,7 @@ struct Sm100FmhaLoadTmaWarpspecialized { auto cumulative_length_kv = get<1>(problem_shape).cumulative_length; if (cumulative_length_kv != nullptr) { int max_length_kv = get<1>(problem_shape).max_length; - 
// for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dK) = get<0>(dK); get<2,1>(dV) = get<0>(dV); get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_kv * (1 + get<3,1>(problem_shape))); diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp index c2d3e2ba6b..c8b3d978db 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp @@ -108,7 +108,7 @@ struct Sm100MlaFwdLoadTmaWarpspecialized { auto cumulative_length_q = get<0>(problem_shape).cumulative_length; if (cumulative_length_q != nullptr) { int max_length_q = get<0>(problem_shape).max_length; - // for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dQ) = get<0>(dQ); get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_q * (1 + get<3,1>(problem_shape))); // offset ptr by the amount we add back in later @@ -120,7 +120,7 @@ struct Sm100MlaFwdLoadTmaWarpspecialized { auto cumulative_length_kv = get<1>(problem_shape).cumulative_length; if (cumulative_length_kv != nullptr) { int max_length_kv = get<1>(problem_shape).max_length; - // for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dK) = get<0>(dK); get<2,1>(dV) = get<0>(dV); get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_kv * (1 + get<3,1>(problem_shape))); diff --git a/examples/77_blackwell_fmha/common/pipeline_mla.hpp b/examples/77_blackwell_fmha/common/pipeline_mla.hpp index 5bbeed9106..b9d05c5662 100644 --- a/examples/77_blackwell_fmha/common/pipeline_mla.hpp +++ b/examples/77_blackwell_fmha/common/pipeline_mla.hpp @@ -224,7 +224,7 @@ class PipelineTmaAsyncMla { static constexpr bool is_2sm_mma = size(AtomThrShape_MNK{}) > 1; // Consumer signalling Producer of completion - // Ensures all blocks in the Same Row and Column get notifed. + // Ensures all blocks in the Same Row and Column get notified. 
CUTLASS_DEVICE void consumer_release(uint32_t stage, uint32_t skip) { detail::pipeline_check_is_consumer(params_.role); diff --git a/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp index 5eb8e20b76..6eb512d025 100644 --- a/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp @@ -435,7 +435,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { size_t workspace_size {0}; if (args.is_fused_reduction && args.split_kv > 1) { // one exchange buffer for LSE max and another buffer for total LSE - // two locks per batch, frist lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127 + // two locks per batch, first lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127 workspace_size = H * B * (sizeof(int) + sizeof(ElementLSE)) + 2 * B * sizeof(int); } else if (!args.is_fused_reduction && args.split_kv > 1) { workspace_size = (sizeof(ElementAcc) * D_latent + sizeof(ElementLSE)) * H * split_kv * B; diff --git a/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp b/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp index c83ebdb747..9897627546 100644 --- a/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp +++ b/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp @@ -186,7 +186,7 @@ void fmha_mla_reference( shared_mem); if (cudaSuccess != result) { result = cudaGetLastError(); // to clear the error bit - throw std::runtime_error("couldn't perform smem optin"); + throw std::runtime_error("couldn't perform smem opt-in"); } } fmha_mla_reference_kernel<<>>( diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu index 10cfe89d3c..f9ff4223da 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu @@ -301,7 +301,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -339,7 +339,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu index 6d8d1de019..862ef680f7 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu @@ -308,7 +308,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -346,7 +346,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu index 
b43869e7f1..60010aa0be 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu @@ -349,7 +349,7 @@ bool initialize_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -390,7 +390,7 @@ bool initialize_scale_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu index 60667cda29..117661cc80 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu @@ -356,7 +356,7 @@ bool initialize_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -397,7 +397,7 @@ bool initialize_scale_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/README.md b/examples/81_blackwell_gemm_blockwise/README.md index 9fe03bab7a..add3469508 100644 --- a/examples/81_blackwell_gemm_blockwise/README.md +++ b/examples/81_blackwell_gemm_blockwise/README.md @@ -57,7 +57,7 @@ The simplest way to use the profiler is to pass `m`, `n`, and `k` as well as you per kernel to determine best rasterization orders, swizzles, and cluster sizes. Passing `blockwiseGemm` or `GroupedGemm` through the operation flag will determine which set of operations will be profiled. -For examle, this command using the cutlass profiler will dump the performance of all compiled kernels which support scale +For example, this command using the cutlass profiler will dump the performance of all compiled kernels which support scale granularity m = 1, scale granularity n = 128, and scale granularity k = 128 for the problem size 8192x8192x8192: ``` cutlass_profiler --operation=blockwiseGemm \ @@ -92,7 +92,7 @@ It is also worthwhile to note that C can be void if scaling by beta is not neede - *MMA Dimensions*: in both Blackwell and Hopper tensor cores it is worthwhile to note that the smallest `MMA_M` dimension is 64, but `MMA_N` dimension can be as small as 8 for some instructions. For problem sizes where M is small consider computing $D^T = \alpha B^T A^T + \beta C^T$ instead. - - When computing after swapping A and B and transposing the N dimension is now our small dimension. With a small `MMA_N` we can more effectively tile without performing unecessary computation. + - When computing after swapping A and B and transposing, the N dimension is now our small dimension. With a small `MMA_N` we can more effectively tile without performing unnecessary computation. - *Layout Swapping*: When optimizing with the profiler swap `m` and `n` inputs and adjust layouts to reflect this swapping and transposing. 
- For example if we have a row-major A, column-major B, and row-major D, we can swap tensors and run a kernel with: - The left hand matrix as row-major (since B transposed is row-major) diff --git a/examples/87_blackwell_geforce_gemm_blockwise/utils.h b/examples/87_blackwell_geforce_gemm_blockwise/utils.h index 7273530388..96dbc8a7d7 100644 --- a/examples/87_blackwell_geforce_gemm_blockwise/utils.h +++ b/examples/87_blackwell_geforce_gemm_blockwise/utils.h @@ -76,7 +76,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/README.md b/examples/README.md index 4765125fd2..5d04048ea4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -244,7 +244,7 @@ * [58_ada_fp8_gemm](58_ada_fp8_gemm/) - Ada GEMM kernel targetting Ada FP8 tensor cores via the CUTLASS 2.x API. + Ada GEMM kernel targeting Ada FP8 tensor cores via the CUTLASS 2.x API. * [59_ampere_gather_scatter_conv](59_ampere_gather_scatter_conv/) diff --git a/examples/cute/tutorial/tiled_copy_if.cu b/examples/cute/tutorial/tiled_copy_if.cu index 17d7de1a0d..0ce6db02fa 100644 --- a/examples/cute/tutorial/tiled_copy_if.cu +++ b/examples/cute/tutorial/tiled_copy_if.cu @@ -177,7 +177,7 @@ int main(int argc, char** argv) // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile // shape, and modes (m', n') correspond to the number of tiles. // - // These will be used to determine the CUDA kernel grid dimensinos. + // These will be used to determine the CUDA kernel grid dimensions. Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape); // ((M, N), m', n') // Describes the layout of threads which is then replicated to tile 'block_shape.' @@ -263,7 +263,7 @@ int main(int argc, char** argv) // Construct tiled copy, a tiling of copy atoms. // - // Note, this assumes the vector and thread layouts are aligned with contigous data + // Note, this assumes the vector and thread layouts are aligned with contiguous data // in GMEM. Alternative thread layouts are possible but may result in uncoalesced // reads. Alternative value layouts are also possible, though incompatible layouts // will result in compile time errors. 
diff --git a/examples/python/CuTeDSL/ampere/smem_allocator.py b/examples/python/CuTeDSL/ampere/smem_allocator.py index 8c54a5a6d8..c6d9bdf464 100644 --- a/examples/python/CuTeDSL/ampere/smem_allocator.py +++ b/examples/python/CuTeDSL/ampere/smem_allocator.py @@ -90,7 +90,7 @@ def kernel( dst_c: cute.Tensor, ): # Note: SMEM_SIZE bytes (specified in kernel().launch(smem=...)) can be reserved for developer to utilize - # Note: alignment of inital allocator base ptr is 1024 + # Note: alignment of initial allocator base ptr is 1024 allocator = cutlass.utils.SmemAllocator() # base ptr of allocator points at: SMEM_ADDR_START (the starting address of available shared memory) diff --git a/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py b/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py index e77221bb87..d3ea1c4f5d 100644 --- a/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py +++ b/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py @@ -221,7 +221,7 @@ def _setup_attributes(self): self.internal_stages, ) - # B needs to be proprocessed to be used as A operand of INTER1_MMA + # B needs to be preprocessed to be used as A operand of INTER1_MMA self.bt_smem_layout = cute.coalesce( sm100_utils.make_smem_layout_epi( self.io_dtype, diff --git a/examples/python/CuTeDSL/hopper/dense_gemm.py b/examples/python/CuTeDSL/hopper/dense_gemm.py index 6bab06ea2f..2f328af949 100644 --- a/examples/python/CuTeDSL/hopper/dense_gemm.py +++ b/examples/python/CuTeDSL/hopper/dense_gemm.py @@ -602,7 +602,7 @@ def kernel( mainloop_pipeline_producer_group = pipeline.CooperativeGroup( pipeline.Agent.Thread ) - # Each warp will constribute to the arrive count with the number of mcast size + # Each warp will contribute the mcast size to the arrive count mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 num_warps = self.threads_per_cta // 32 consumer_arrive_cnt = mcast_size * num_warps diff --git a/examples/python/deprecated/00_basic_gemm.ipynb b/examples/python/deprecated/00_basic_gemm.ipynb index 8e325f4d28..58a13d8fd5 100644 --- a/examples/python/deprecated/00_basic_gemm.ipynb +++ b/examples/python/deprecated/00_basic_gemm.ipynb @@ -133,7 +133,7 @@ "id": "4a5856de", "metadata": {}, "source": [ - "There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor." + "There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifying the types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor." ] }, { @@ -197,7 +197,7 @@ "\n", "As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n", "\n", - "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization." + "Also notice that, this time around, we provided tensor parameters to `plan.run()`. 
One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on initialization." ] }, { diff --git a/examples/python/deprecated/01_epilogue.ipynb b/examples/python/deprecated/01_epilogue.ipynb index f5196d4409..c4f3d35dd2 100644 --- a/examples/python/deprecated/01_epilogue.ipynb +++ b/examples/python/deprecated/01_epilogue.ipynb @@ -70,7 +70,7 @@ "\n", "import cutlass_cppgen\n", "\n", - "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n", diff --git a/examples/python/deprecated/03_basic_conv2d.ipynb b/examples/python/deprecated/03_basic_conv2d.ipynb index aa41997b33..c3395e4fc9 100644 --- a/examples/python/deprecated/03_basic_conv2d.ipynb +++ b/examples/python/deprecated/03_basic_conv2d.ipynb @@ -113,7 +113,7 @@ "\n", "We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n", "\n", - "Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n", + "Assuming that we are running on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n", "\n", "Calling `plan.run()` will generate the CUTLASS C++ kernel in question, compile it, and run it on the tensors we previously passed in. By setting `print_module` to `true`, the C++ code that is emitted is printed." ] diff --git a/examples/python/deprecated/04_epilogue_visitor.ipynb b/examples/python/deprecated/04_epilogue_visitor.ipynb index 6ba68aadd7..92cecddaf0 100644 --- a/examples/python/deprecated/04_epilogue_visitor.ipynb +++ b/examples/python/deprecated/04_epilogue_visitor.ipynb @@ -72,7 +72,7 @@ "from cutlass_cppgen import Tensor as FakeTensor\n", "from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n", "\n", - "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n", @@ -108,7 +108,7 @@ "metadata": {}, "source": [ "## Define the epilogue visitor functor\n", - "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", + "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). 
The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmetic operations and built-in callable functions like `relu`. It also accommodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", "* Each named variable must be assigned exactly once and defined before it used.\n", "* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n", "* Return values must be a named variable.\n",