diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index df8009e0ea..5d1c0c15ea 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -47,7 +47,7 @@ or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are prevalent in the CUTLASS unit tests. - This example has delibrately been kept similar to the basic_gemm example from cutlass-1.3 to + This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to highlight the minimum amount of differences needed to transition to cutlass-2.0. Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index 8bad0bbdaa..37cbc1a368 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. Setting up computation properties : describes how the above set matrices will be used to compute diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 70afef7e59..da157470fd 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. Setting up computation properties : describes how the above set matrices will be used to compute diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index b58896458a..8d510ebb5c 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -47,7 +47,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. 
Setting up computation properties : describes how the above set tensors will be used to compute diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h index df4cb76ad1..e9aac97abf 100644 --- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h @@ -560,7 +560,7 @@ class B2bFusedConv2dRun { if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n" << " problem_size_0.K = problem_size_1.C\n" << " problem_size_1.R = problem_size_1.S = 1\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h index f0e85cda3a..6391f41dd5 100644 --- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h @@ -604,7 +604,7 @@ struct B2bFusedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h index b6267a153b..ed88eb67cb 100644 --- a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h @@ -302,7 +302,7 @@ struct B2bFusedGroupedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h index 4693e86423..8221273869 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h @@ -586,7 +586,7 @@ class B2bInterleavedFusedConv2dRun { if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n" << " problem_size_0.K = problem_size_1.C\n" << " problem_size_1.R = problem_size_1.S = 1\n" diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h index 453f44cd0c..a8b627a037 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h @@ -638,7 +638,7 @@ struct B2bInterleavedFusedGemmRun if(status != cutlass::Status::kSuccess) { std::cout << "Problem sizes not supported.\n" - << "Requirments:\n" + << "Requirements:\n" << " problem_size_0.M = problem_size_1.M\n" << " problem_size_0.N = problem_size_1.K\n" << " ThreadblockShape0::kN = problem_size_0.N\n" diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h index c845f2023f..139fa90a9e 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h @@ -97,7 +97,7 @@ class B2bMmaBase { Shape1::kN / WarpGemm1::kN, Shape1::kK / WarpGemm1::kK>; - /// Number of warp-level 
GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations0 = (WarpGemm0::kK / Operator0::Policy::MmaShape::kK); static int const kWarpGemmIterations1 = diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu index 8e0094f69a..90f5fe7099 100644 --- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu +++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu @@ -41,7 +41,7 @@ change to method to visit the global memory: address addr = base_pointer + coord1 * stride1 + coord2 * stride2 -The rest part of GEMM which includes shared memory load/store, mma comutation +The rest of the GEMM, which includes shared memory load/store and mma computation, is the same. This example uses Ampere fp64 tensore core Affine2 GEMM as an example. SIMT diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h index 66c099d15b..f3c26fb986 100644 --- a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h @@ -87,7 +87,7 @@ class CustomMmaBase { Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); diff --git a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h index f2b94d0031..eb1917d206 100644 --- a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h +++ b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h @@ -167,7 +167,7 @@ class MmaBaseFromSharedMemory { Shape::kK / WarpGemm::kK>; using WarpCount1 = WarpCount; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static int const kWarpGemmIterations1 = kWarpGemmIterations; diff --git a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h index 7a52e96a36..b3e9e1a2fb 100644 --- a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h +++ b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h @@ -67,7 +67,7 @@ namespace threadblock { /// ForwardTileIterator /// template < - typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap) typename Element_, ///< Element data type bool ScatterD = false, ///< Scatter D operand or not bool UseCUDAStore = false> diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py index 6cd01ef16b..081aaaca75 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py +++ 
b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py @@ -45,7 +45,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep self.b2b_num = len(fuse_gemm_info) self.user_header_file = user_header_file self.args = {} - # device arg struct memebr + # device arg struct member self.arg_member = [] self.gen_class_name = gen_class_name self.gen_kernel_name = gen_class_name + "Kernel" diff --git a/examples/45_dual_gemm/threadblock/dual_mma_base.h b/examples/45_dual_gemm/threadblock/dual_mma_base.h index 754719033e..8aee22abb5 100644 --- a/examples/45_dual_gemm/threadblock/dual_mma_base.h +++ b/examples/45_dual_gemm/threadblock/dual_mma_base.h @@ -91,7 +91,7 @@ class DualMmaBase { Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator0::Policy::MmaShape::kK); diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu index d5758aa219..98dd043a3f 100644 --- a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu +++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu @@ -238,7 +238,7 @@ bool initialize_block( // to select an appropriate value on its own. The CollectiveBuilder will attempt to select // configurations that will result in the most-performant kernel, but this is not a guarantee. // -// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatiblity. +// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatibility. // For example, if `KernelScheduleAuto` is used for the mainloop builder, `EpilogueScheduleAuto` must // be used for the epilogue builder. // diff --git a/examples/51_hopper_gett/51_hopper_gett.cu b/examples/51_hopper_gett/51_hopper_gett.cu index f2eb5c0bd3..c0df9029fe 100644 --- a/examples/51_hopper_gett/51_hopper_gett.cu +++ b/examples/51_hopper_gett/51_hopper_gett.cu @@ -58,7 +58,7 @@ In the following example, we illustrate how every Hopper GEMM in CUTLASS 3.0 is a GETT in disguise. We begin by defining the four modes detailed above as Row, Col (column), Red (reduction), and Bat (batch) strides, which we then nest for each of the in/out tensors to create our rank-3 stride - tuples. Note that although we do not define the problem shape type explicitely, it too remains a + tuples. Note that although we do not define the problem shape type explicitly, it too remains a rank-4 shape tuple just like any other batched GEMM, but instead with multi-mode shapes for each of the four corresponding multi-modes within it. After this, the same CollectiveMma and CollectiveBuilder we describe in examples 50 and 49 are used to create our kernel type. 
Nothing diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu index 2d2b719718..1ac54109b8 100644 --- a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu +++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu @@ -180,7 +180,7 @@ struct Options { " --alpha= GEMM alpha parameter\n" " --beta= GEMM beta parameter\n" " --iterations= Number of profiling iterations to perform.\n" - " --check= Validate results against a reference (unfused) imlementation" + " --check= Validate results against a reference (unfused) implementation" " --verbose= Enable verbose output" "\n" "Examples:\n" diff --git a/examples/59_ampere_gather_scatter_conv/README.md b/examples/59_ampere_gather_scatter_conv/README.md index b16ddf9572..1e5ce8f9c0 100644 --- a/examples/59_ampere_gather_scatter_conv/README.md +++ b/examples/59_ampere_gather_scatter_conv/README.md @@ -85,8 +85,8 @@ Now that we have transformed our problem in such a way that allows us to dispatc we can reuse much of the machinery CUTLASS offers to implement this forward pass convolution operator. CUTLASS decomposes these "moving parts" of GPU linear algebra into reusable, modular software components abstracted by C++ template classes. This example -demonstrates how some of the lower layers of the hierarchy can be re-used for custom kernels -by writing a custom kernel for convolution that re-uses the Ampere/Ada GEMM collectives +demonstrates how some of the lower layers of the hierarchy can be reused for custom kernels +by writing a custom kernel for convolution that reuses the Ampere/Ada GEMM collectives from CUTLASS 3. A kernel author is free to compose their custom components with any of the existing templates diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index f080b6c69d..8cd13d43a1 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -302,7 +302,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -340,7 +340,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index 19e012b009..7166dd15d4 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -312,7 +312,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - 
throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -350,7 +350,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp index 616357cb0e..008cfd1ca8 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp @@ -118,7 +118,7 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized { auto cumulative_length_q = get<0>(problem_shape).cumulative_length; if (cumulative_length_q != nullptr) { int max_length_q = get<0>(problem_shape).max_length; - // for variable sequence lenght, the batch is in units of row_stride + // for variable sequence length, the batch is in units of row_stride get<2,1>(dO) = get<0>(dO); get<2,1>(problem_shape_O) = max_length_q * (1 + get<2,1>(problem_shape_O)); // offset ptr by the amount we add back in later diff --git a/examples/77_blackwell_fmha/common/pipeline_mla.hpp b/examples/77_blackwell_fmha/common/pipeline_mla.hpp index 5bbeed9106..b9d05c5662 100644 --- a/examples/77_blackwell_fmha/common/pipeline_mla.hpp +++ b/examples/77_blackwell_fmha/common/pipeline_mla.hpp @@ -224,7 +224,7 @@ class PipelineTmaAsyncMla { static constexpr bool is_2sm_mma = size(AtomThrShape_MNK{}) > 1; // Consumer signalling Producer of completion - // Ensures all blocks in the Same Row and Column get notifed. + // Ensures all blocks in the Same Row and Column get notified. 
CUTLASS_DEVICE void consumer_release(uint32_t stage, uint32_t skip) { detail::pipeline_check_is_consumer(params_.role); diff --git a/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp index e9edb90e57..7017d60f31 100644 --- a/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp @@ -435,7 +435,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { size_t workspace_size {0}; if (args.is_fused_reduction && args.split_kv > 1) { // one exchange buffer for LSE max and another buffer for total LSE - // two locks per batch, frist lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127 + // two locks per batch, first lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127 workspace_size = H * B * (sizeof(int) + sizeof(ElementLSE)) + 2 * B * sizeof(int); } else if (!args.is_fused_reduction && args.split_kv > 1) { workspace_size = (sizeof(ElementAcc) * D_latent + sizeof(ElementLSE)) * H * split_kv * B; diff --git a/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp b/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp index c83ebdb747..9897627546 100644 --- a/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp +++ b/examples/77_blackwell_fmha/reference/fmha_mla_reference.hpp @@ -186,7 +186,7 @@ void fmha_mla_reference( shared_mem); if (cudaSuccess != result) { result = cudaGetLastError(); // to clear the error bit - throw std::runtime_error("couldn't perform smem optin"); + throw std::runtime_error("couldn't perform smem opt-in"); } } fmha_mla_reference_kernel<<>>( diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu index 10cfe89d3c..f9ff4223da 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu @@ -301,7 +301,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -339,7 +339,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu index 6d8d1de019..862ef680f7 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu @@ -308,7 +308,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -346,7 +346,7 @@ bool initialize_scale_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu index 
b43869e7f1..60010aa0be 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu @@ -349,7 +349,7 @@ bool initialize_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -390,7 +390,7 @@ bool initialize_scale_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu index 60667cda29..117661cc80 100644 --- a/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu +++ b/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu @@ -356,7 +356,7 @@ bool initialize_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; @@ -397,7 +397,7 @@ bool initialize_scale_tensor( cutlass::reference::host::TensorFill(view, Element(1)); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/81_blackwell_gemm_blockwise/README.md b/examples/81_blackwell_gemm_blockwise/README.md index 9fe03bab7a..add3469508 100644 --- a/examples/81_blackwell_gemm_blockwise/README.md +++ b/examples/81_blackwell_gemm_blockwise/README.md @@ -57,7 +57,7 @@ The simplest way to use the profiler is to pass `m`, `n`, and `k` as well as you per kernel to determine best rasterization orders, swizzles, and cluster sizes. Passing `blockwiseGemm` or `GroupedGemm` through the operation flag will determine which set of operations will be profiled. -For examle, this command using the cutlass profiler will dump the performance of all compiled kernels which support scale +For example, this command using the cutlass profiler will dump the performance of all compiled kernels which support scale granularity m = 1, scale granularity n = 128, and scale granularity k = 128 for the problem size 8192x8192x8192: ``` cutlass_profiler --operation=blockwiseGemm \ @@ -92,7 +92,7 @@ It is also worthwhile to note that C can be void if scaling by beta is not neede - *MMA Dimensions*: in both Blackwell and Hopper tensor cores it is worthwhile to note that the smallest `MMA_M` dimension is 64, but `MMA_N` dimension can be as small as 8 for some instructions. For problem sizes where M is small consider computing $D^T = \alpha B^T A^T + \beta C^T$ instead. - - When computing after swapping A and B and transposing the N dimension is now our small dimension. With a small `MMA_N` we can more effectively tile without performing unecessary computation. + - When computing after swapping A and B and transposing the N dimension is now our small dimension. With a small `MMA_N` we can more effectively tile without performing unnecessary computation. - *Layout Swapping*: When optimizing with the profiler swap `m` and `n` inputs and adjust layouts to reflect this swapping and transposing. 
- For example if we have a row-major A, column-major B, and row-major D, we can swap tensors and run a kernel with: - The left hand matrix as row-major (since B transposed is row-major) diff --git a/examples/87_blackwell_geforce_gemm_blockwise/utils.h b/examples/87_blackwell_geforce_gemm_blockwise/utils.h index 7273530388..96dbc8a7d7 100644 --- a/examples/87_blackwell_geforce_gemm_blockwise/utils.h +++ b/examples/87_blackwell_geforce_gemm_blockwise/utils.h @@ -76,7 +76,7 @@ bool initialize_tensor( cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); } else { - throw std::runtime_error("Not implementated."); + throw std::runtime_error("Not implemented."); } return true; diff --git a/examples/README.md b/examples/README.md index 4765125fd2..5d04048ea4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -244,7 +244,7 @@ * [58_ada_fp8_gemm](58_ada_fp8_gemm/) - Ada GEMM kernel targetting Ada FP8 tensor cores via the CUTLASS 2.x API. + Ada GEMM kernel targeting Ada FP8 tensor cores via the CUTLASS 2.x API. * [59_ampere_gather_scatter_conv](59_ampere_gather_scatter_conv/) diff --git a/examples/cute/tutorial/tiled_copy_if.cu b/examples/cute/tutorial/tiled_copy_if.cu index 17d7de1a0d..0ce6db02fa 100644 --- a/examples/cute/tutorial/tiled_copy_if.cu +++ b/examples/cute/tutorial/tiled_copy_if.cu @@ -177,7 +177,7 @@ int main(int argc, char** argv) // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile // shape, and modes (m', n') correspond to the number of tiles. // - // These will be used to determine the CUDA kernel grid dimensinos. + // These will be used to determine the CUDA kernel grid dimensions. Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape); // ((M, N), m', n') // Describes the layout of threads which is then replicated to tile 'block_shape.' @@ -263,7 +263,7 @@ int main(int argc, char** argv) // Construct tiled copy, a tiling of copy atoms. // - // Note, this assumes the vector and thread layouts are aligned with contigous data + // Note, this assumes the vector and thread layouts are aligned with contiguous data // in GMEM. Alternative thread layouts are possible but may result in uncoalesced // reads. Alternative value layouts are also possible, though incompatible layouts // will result in compile time errors. 
diff --git a/examples/python/CuTeDSL/ampere/smem_allocator.py b/examples/python/CuTeDSL/ampere/smem_allocator.py index f9f5c1e03f..95f747b010 100644 --- a/examples/python/CuTeDSL/ampere/smem_allocator.py +++ b/examples/python/CuTeDSL/ampere/smem_allocator.py @@ -90,7 +90,7 @@ def kernel( dst_c: cute.Tensor, ): # Note: SMEM_SIZE bytes (specified in kernel().launch(smem=...)) can be reserved for developer to utilize - # Note: alignment of inital allocator base ptr is 1024 + # Note: alignment of initial allocator base ptr is 1024 allocator = cutlass.utils.SmemAllocator() # base ptr of allocator points at: SMEM_ADDR_START (the starting address of available shared memory) diff --git a/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py b/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py index 829d6b7eca..a111e7b8f8 100644 --- a/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py +++ b/examples/python/CuTeDSL/blackwell/mamba2_ssd/mamba2_ssd.py @@ -221,7 +221,7 @@ def _setup_attributes(self): self.internal_stages, ) - # B needs to be proprocessed to be used as A operand of INTER1_MMA + # B needs to be preprocessed to be used as A operand of INTER1_MMA self.bt_smem_layout = cute.coalesce( sm100_utils.make_smem_layout_epi( self.io_dtype, diff --git a/examples/python/CuTeDSL/hopper/dense_gemm.py b/examples/python/CuTeDSL/hopper/dense_gemm.py index c59ace02d6..3eb790cb2c 100644 --- a/examples/python/CuTeDSL/hopper/dense_gemm.py +++ b/examples/python/CuTeDSL/hopper/dense_gemm.py @@ -605,7 +605,7 @@ def kernel( mainloop_pipeline_producer_group = pipeline.CooperativeGroup( pipeline.Agent.Thread ) - # Each warp will constribute to the arrive count with the number of mcast size + # Each warp will contribute to the arrive count with the mcast size mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 num_warps = self.threads_per_cta // 32 consumer_arrive_cnt = mcast_size * num_warps diff --git a/examples/python/deprecated/00_basic_gemm.ipynb b/examples/python/deprecated/00_basic_gemm.ipynb index 8e325f4d28..58a13d8fd5 100644 --- a/examples/python/deprecated/00_basic_gemm.ipynb +++ b/examples/python/deprecated/00_basic_gemm.ipynb @@ -133,7 +133,7 @@ "id": "4a5856de", "metadata": {}, "source": [ - "There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor." + "There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifying the types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor." ] }, { @@ -197,7 +197,7 @@ "\n", "As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n", "\n", - "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization." + "Also notice that, this time around, we provided tensor parameters to `plan.run()`. 
One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on initialization." ] }, { diff --git a/examples/python/deprecated/01_epilogue.ipynb b/examples/python/deprecated/01_epilogue.ipynb index f5196d4409..c4f3d35dd2 100644 --- a/examples/python/deprecated/01_epilogue.ipynb +++ b/examples/python/deprecated/01_epilogue.ipynb @@ -70,7 +70,7 @@ "\n", "import cutlass_cppgen\n", "\n", - "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n", diff --git a/examples/python/deprecated/03_basic_conv2d.ipynb b/examples/python/deprecated/03_basic_conv2d.ipynb index aa41997b33..c3395e4fc9 100644 --- a/examples/python/deprecated/03_basic_conv2d.ipynb +++ b/examples/python/deprecated/03_basic_conv2d.ipynb @@ -113,7 +113,7 @@ "\n", "We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n", "\n", - "Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n", + "Assuming that we are running on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n", "\n", "Calling `plan.run()` will generate the CUTLASS C++ kernel in question, compile it, and run it on the tensors we previously passed in. By setting `print_module` to `true`, the C++ code that is emitted is printed." ] diff --git a/examples/python/deprecated/04_epilogue_visitor.ipynb b/examples/python/deprecated/04_epilogue_visitor.ipynb index 6ba68aadd7..92cecddaf0 100644 --- a/examples/python/deprecated/04_epilogue_visitor.ipynb +++ b/examples/python/deprecated/04_epilogue_visitor.ipynb @@ -72,7 +72,7 @@ "from cutlass_cppgen import Tensor as FakeTensor\n", "from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n", "\n", - "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n", @@ -108,7 +108,7 @@ "metadata": {}, "source": [ "## Define the epilogue visitor functor\n", - "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", + "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). 
The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmetic operations and built-in callable functions like `relu`. It also accommodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", "* Each named variable must be assigned exactly once and defined before it used.\n", "* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n", "* Return values must be a named variable.\n", diff --git a/python/CuTeDSL/cutlass/base_dsl/ast_preprocessor.py b/python/CuTeDSL/cutlass/base_dsl/ast_preprocessor.py index 11f2d1ae84..4025317a8e 100644 --- a/python/CuTeDSL/cutlass/base_dsl/ast_preprocessor.py +++ b/python/CuTeDSL/cutlass/base_dsl/ast_preprocessor.py @@ -1009,7 +1009,7 @@ def _handle_negative_step(self, node, start_expr, stop_expr, step_expr): extra_exprs.append(step) extra_exprs.append(offset) - # Add this to begining of loop body + # Add this to beginning of loop body # for i in range(start, stop, step): # i = offset - i if isNegative else i assert isinstance(node.target, ast.Name) diff --git a/python/CuTeDSL/cutlass/cute/arch/smem.py b/python/CuTeDSL/cutlass/cute/arch/smem.py index 37f87ea64d..bfff87d538 100644 --- a/python/CuTeDSL/cutlass/cute/arch/smem.py +++ b/python/CuTeDSL/cutlass/cute/arch/smem.py @@ -77,7 +77,7 @@ def get_dyn_smem( :param alignment: An optional pointer alignment, the result pointer is offset appropriately :type alignment: int :return: A pointer to the start of the dynamic SMEM allocation with a correct - alignement + alignment :rtype: Pointer """ if not isinstance(element_type, NumericMeta): diff --git a/python/CuTeDSL/cutlass/cute/core.py b/python/CuTeDSL/cutlass/cute/core.py index 12d5e4221a..e84a320f7d 100644 --- a/python/CuTeDSL/cutlass/cute/core.py +++ b/python/CuTeDSL/cutlass/cute/core.py @@ -1723,7 +1723,7 @@ def print_tensor( # -# Utilties +# Utilities # @@ -4301,7 +4301,7 @@ def flat_divide(target, tiler: Tiler, *, loc=None, ip=None): # -# Higher-level utilties +# Higher-level utilities # @@ -6727,7 +6727,7 @@ class StorageA: intA : cutlass.Int16 - # Supports aligment for its elements: + # Supports alignment for its elements: @cute.struct class StorageB: a: cute.struct.Align[ @@ -7000,7 +7000,7 @@ def add_offset(val): f"Struct element only support struct/array/base_dsl scalar, " f"but got {object}" ) - # Total aligment determined by the strictest requirement + # Total alignment determined by the strictest requirement alignment = max(alignment, sub_align) # Total size determined by alignment self._align_of = alignment diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py index a154956023..a852e48d4d 100644 --- a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py @@ -27,7 +27,7 @@ #################################################################################################### # -# Aynchronous copies +# Asynchronous copies # #################################################################################################### @@ -125,7 +125,7 @@ class CopyG2STrait(Trait): @dataclass(frozen=True) class CopyBulkTensorTileG2SOp(CopyOp): """ - Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit. + Bulk tensor asynchronous GMEM to SMEM Copy Operation using the TMA unit. See the `PTX documentation `__. 
This Operation uses TMA in the ``.tile`` mode. @@ -227,7 +227,7 @@ def unpack( @dataclass(frozen=True) class CopyBulkTensorTileG2SMulticastOp(CopyOp): """ - Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit. + Bulk tensor asynchronous multicast GMEM to SMEM Copy Operation using the TMA unit. See the `PTX documentation `__. This Operation uses TMA in the ``.tile`` mode. diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py index f64f07f167..862274c02a 100644 --- a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py @@ -271,7 +271,7 @@ def update_tma_descriptor( :type tma_atom: CopyAtom :param gmem_tensor: The GMEM tensor :type gmem_tensor: Tensor - :param tensormap_ptr: The pointer to the memory location of the descriptor to udpate + :param tensormap_ptr: The pointer to the memory location of the descriptor to update :type tensormap_ptr: Pointer """ _cute_nvgpu_ir.update_tma_desc( diff --git a/python/CuTeDSL/cutlass/cutlass_dsl/cutlass.py b/python/CuTeDSL/cutlass/cutlass_dsl/cutlass.py index 1630c873c7..9b59072de1 100644 --- a/python/CuTeDSL/cutlass/cutlass_dsl/cutlass.py +++ b/python/CuTeDSL/cutlass/cutlass_dsl/cutlass.py @@ -833,7 +833,7 @@ def count_values(args): # ============================================================================= -# DSL implementation of Python Build-in Operators +# DSL implementation of Python Built-in Operators # ============================================================================= diff --git a/python/CuTeDSL/cutlass/pipeline/sm100.py b/python/CuTeDSL/cutlass/pipeline/sm100.py index 2feed8cc0f..4c5652c34f 100644 --- a/python/CuTeDSL/cutlass/pipeline/sm100.py +++ b/python/CuTeDSL/cutlass/pipeline/sm100.py @@ -401,7 +401,7 @@ def create( producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk) if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1: - # Set mask to None if not using 2CTA intructions + # Set mask to None if not using 2CTA instructions consumer_mask = None else: consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank() diff --git a/python/CuTeDSL/cutlass/torch.py b/python/CuTeDSL/cutlass/torch.py index e5ee5777ca..a541c081b8 100644 --- a/python/CuTeDSL/cutlass/torch.py +++ b/python/CuTeDSL/cutlass/torch.py @@ -169,7 +169,7 @@ def convert_cute_tensor( ) -> Tensor: """ Change the value of the cute tensor to make its value converted from a fp32 torch tensor. - Used for fp8 types tensor creatation now. + Used for fp8 types tensor creation now. """ # if torch_tensor is on cpu, create a gpu copy if f32_torch_tensor.device.type == "cpu": diff --git a/python/CuTeDSL/cutlass/utils/README.md b/python/CuTeDSL/cutlass/utils/README.md index 3a583ed49f..d6a6860a58 100644 --- a/python/CuTeDSL/cutlass/utils/README.md +++ b/python/CuTeDSL/cutlass/utils/README.md @@ -1,9 +1,9 @@ # Utilities -This folder contains various utilties for kernel authoring. Specifically, the implementation of the -followings can be considered experimental and subject to breaking changes: +This folder contains various utilities for kernel authoring. 
Specifically, the implementation of the +following can be considered experimental and subject to breaking changes: - static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py) - pipeline abstractions defined in [`pipeline.py`](./pipeline.py) -- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) +- grouped GEMM utilities defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) and [`tensormap_manager.py`](./tensormap_manager.py) diff --git a/python/cutlass_cppgen/backend/compiler.py b/python/cutlass_cppgen/backend/compiler.py index 0b66ce8a24..fa05d57f37 100644 --- a/python/cutlass_cppgen/backend/compiler.py +++ b/python/cutlass_cppgen/backend/compiler.py @@ -349,7 +349,7 @@ def emit_compile_(self, operation_list, compilation_options, host_compilation_op cmd.extend(host_compilation_options.get_str().split(" ")) cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"]) - # Comile and load the library + # Compile and load the library compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt") host_lib = ctypes.CDLL(temp_dst.name) diff --git a/python/cutlass_cppgen/backend/evt/passes/pass_layout_elimination.py b/python/cutlass_cppgen/backend/evt/passes/pass_layout_elimination.py index af147969f0..5e53119811 100644 --- a/python/cutlass_cppgen/backend/evt/passes/pass_layout_elimination.py +++ b/python/cutlass_cppgen/backend/evt/passes/pass_layout_elimination.py @@ -53,7 +53,7 @@ def __init__(self, dag_ir: DAGIR) -> None: def call(self): self.layout_nodes_worklist = self.get_all_layout_nodes() - # Run while loop utill all layout nodes are eliminated + # Run while loop until all layout nodes are eliminated while(len(self.layout_nodes_worklist) > 0): node = self.layout_nodes_worklist.pop(0) # for node in layout_nodes: diff --git a/python/cutlass_cppgen/backend/gemm_operation.py b/python/cutlass_cppgen/backend/gemm_operation.py index 5e2a3a30a0..32825ae6d4 100644 --- a/python/cutlass_cppgen/backend/gemm_operation.py +++ b/python/cutlass_cppgen/backend/gemm_operation.py @@ -113,7 +113,7 @@ def leading_dimension(layout: LayoutType, shape: MatrixCoord) -> int: """ - Returns the leading dimenson of a tensor with layout ``layout`` and shape ``shape``. + Returns the leading dimension of a tensor with layout ``layout`` and shape ``shape``. :param layout: layout of the tensor :type layout: cutlass_cppgen.shape.LayoutType @@ -1514,7 +1514,7 @@ def __init__( # Optionally swap the TensorDescriptions for operands A and B and transpose their # layouts. This is needed to mimic the transpose performed by device::GemmUniversal. - # The code below uses deep copy to avoid overwritting the original TensorDescription + # The code below uses deep copy to avoid overwriting the original TensorDescription self.switched = (self.api != ApiVersion.v3x and self.emission_type == EmissionType.Kernel and C.layout == LayoutType.ColumnMajor) @@ -1779,7 +1779,7 @@ def __init__(self, arch, tile_description: TileDescription, A: TensorDescription epilogue_functor, swizzling_functor=SwizzlingFunctor.Identity1, **kwargs): super(GemmOperationGrouped, self).__init__(GemmKind.Grouped, arch, tile_description, A, B, C, epilogue_functor, swizzling_functor, **kwargs) - assert "precompute_mode" in kwargs.keys(), "missing keyword arguement 'precompute_mode'." 
+ assert "precompute_mode" in kwargs.keys(), "missing keyword argument 'precompute_mode'." self.precompute_mode = kwargs["precompute_mode"] self.rt_module = GemmRTGrouped(self) self.argument_type = self.rt_module.argument_type diff --git a/python/cutlass_cppgen/backend/library.py b/python/cutlass_cppgen/backend/library.py index a77b302dcc..6379ebf9f9 100644 --- a/python/cutlass_cppgen/backend/library.py +++ b/python/cutlass_cppgen/backend/library.py @@ -275,7 +275,7 @@ def __init__( """ :param threadblock_shape: shape of a threadblock tyle :type threadblock_shape: list or tuple - :param stages: number of pipline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum + :param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum number of stages that can be supported for an operation on a given architecture will be computed at a later time :type stages: int or None :param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile diff --git a/python/cutlass_cppgen/backend/reduction_operation.py b/python/cutlass_cppgen/backend/reduction_operation.py index 535cea2cb2..8fb6ed162d 100644 --- a/python/cutlass_cppgen/backend/reduction_operation.py +++ b/python/cutlass_cppgen/backend/reduction_operation.py @@ -377,7 +377,7 @@ def configuration_name(self): ) def procedural_name(self): - """The full procedural name indicates architeture, extended name, tile size""" + """The full procedural name indicates architecture, extended name, tile size""" return self.configuration_name() def run(self, arguments: ReductionArguments) -> cuda.CUresult: diff --git a/python/cutlass_cppgen/backend/utils/device.py b/python/cutlass_cppgen/backend/utils/device.py index 9ed4096a6f..23c458c8dc 100644 --- a/python/cutlass_cppgen/backend/utils/device.py +++ b/python/cutlass_cppgen/backend/utils/device.py @@ -93,7 +93,7 @@ def device_sm_count(device: int = -1): ) if err != cuda.CUresult.CUDA_SUCCESS: raise Exception( - "Failed to retireve SM count. " + "Failed to retrieve SM count. " f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}" ) diff --git a/python/cutlass_cppgen/emit/pytorch.py b/python/cutlass_cppgen/emit/pytorch.py index fe96f3ede1..375306660e 100644 --- a/python/cutlass_cppgen/emit/pytorch.py +++ b/python/cutlass_cppgen/emit/pytorch.py @@ -622,7 +622,7 @@ class _ArchListSetter: Utility context manager for temporarily setting the value of the ``TORCH_CUDA_ARCH_LIST`` environment variable when building a PyTorch CUDA module. - ``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilites for which a PyTorch + ``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilities for which a PyTorch CUDA module should be compiled. For example, ``TORCH_CUDA_ARCH_LIST="7.0 8.0"`` would result in the inclusion of @@ -630,7 +630,7 @@ class _ArchListSetter: compilation of the module. This utility wraps the building of a PyTorch CUDA module with a setting of this environment - variable according to the current compute capability being targetted. + variable according to the current compute capability being targeted. 
Example usage: diff --git a/python/cutlass_cppgen/epilogue/epilogue.py b/python/cutlass_cppgen/epilogue/epilogue.py index a3a17506ee..130b829036 100644 --- a/python/cutlass_cppgen/epilogue/epilogue.py +++ b/python/cutlass_cppgen/epilogue/epilogue.py @@ -122,7 +122,7 @@ def trace(fn, example_tensors, **kwargs): :param example_tensors: example inputs for fn :type example_tensors: dict - .. hightlight:: python + .. highlight:: python .. code-block:: python import cutlass_cppgen.backend.evt diff --git a/python/cutlass_cppgen/op/conv.py b/python/cutlass_cppgen/op/conv.py index 711b27da13..145d3dc15b 100644 --- a/python/cutlass_cppgen/op/conv.py +++ b/python/cutlass_cppgen/op/conv.py @@ -183,7 +183,7 @@ class Conv2d(OperationBase): :param B: tensor representing data type of operand B :param C: tensor representing data type of operand C :param D: tensor representing data type of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type :type element: cutlass_cppgen.DataType @@ -749,7 +749,7 @@ def run(self, A=None, B=None, C=None, D=None, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: tensor representing data type and layout of operand A @@ -759,7 +759,7 @@ def run(self, A=None, B=None, C=None, D=None, :param stride: (stride_h, stride_w) describing the convolution stride. Default: (1, 1) :param padding: (pad_h, pad_w) describing the convolution padding. Default: (0, 0) :param dilation: (dilation_h, dilation_w) describing the dilation of convolution. 
Default: (1, 1) - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param split_k: a tuple (split_k_mode, split_k_slices) :param sync: whether the call should wait for the kernel to complete before returning diff --git a/python/cutlass_cppgen/op/gemm.py b/python/cutlass_cppgen/op/gemm.py index a6f9b1ab43..dcdd1bca06 100644 --- a/python/cutlass_cppgen/op/gemm.py +++ b/python/cutlass_cppgen/op/gemm.py @@ -190,7 +190,7 @@ class Gemm(OperationBase): :param B: tensor representing data type and layout of operand B :param C: tensor representing data type and layout of operand C :param D: tensor representing data type and layout of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element_accumulator: data type to be used in accumulation of the product of operands A and B :type element_accumulator: cutlass_cppgen.DataType @@ -641,14 +641,14 @@ def run(self, A=None, B=None, C=None, D=None, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: tensor representing data type and layout of operand A :param B: tensor representing data type and layout of operand B :param C: tensor representing data type and layout of operand C :param D: tensor representing data type and layout of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param sync: whether the call should wait for the kernel to complete before returning :type sync: bool diff --git a/python/cutlass_cppgen/op/gemm_grouped.py b/python/cutlass_cppgen/op/gemm_grouped.py index 59f90535c2..0ba3b73fa8 100644 --- a/python/cutlass_cppgen/op/gemm_grouped.py +++ b/python/cutlass_cppgen/op/gemm_grouped.py @@ -87,7 +87,7 @@ class GroupedGemm(Gemm): :param B: tensor representing data type and layout of operands B :param C: tensor representing data type and layout of operands C :param D: tensor representing data type and layout of operands D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element_accumulator: data type to be used in accumulation of the product of operands A and B :type element_accumulator: cutlass_cppgen.DataType @@ -204,7 +204,7 @@ def run(self, A, B, C, D, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. 
In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: list of tensors representing data type and layout of operand A @@ -215,7 +215,7 @@ def run(self, A, B, C, D, :type C: list :param D: list of tensors representing data type and layout of operand D :type D: list - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param sync: whether the call should wait for the kernel to complete before returning :type sync: bool diff --git a/python/cutlass_cppgen/op/op.py b/python/cutlass_cppgen/op/op.py index bebf07a7e5..e43ca23e47 100644 --- a/python/cutlass_cppgen/op/op.py +++ b/python/cutlass_cppgen/op/op.py @@ -425,7 +425,7 @@ def epilogue_visitor(self, visitor): def run_setup(self): """ - Steps that must be taken before caling `plan.run()` + Steps that must be taken before calling `plan.run()` """ # Initialize the memory pool if, if not already done cutlass_cppgen.get_memory_pool() diff --git a/python/cutlass_cppgen/utils/check.py b/python/cutlass_cppgen/utils/check.py index 108f268b4b..c8e796235c 100644 --- a/python/cutlass_cppgen/utils/check.py +++ b/python/cutlass_cppgen/utils/check.py @@ -140,7 +140,7 @@ def valid_stage_count( f"Details:\n" f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and " f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n" - f"The maxmium amount of shared memory that can be used per block on CC {cc} is {smem_arch}.") + f"The maximum amount of shared memory that can be used per block on CC {cc} is {smem_arch}.") return (True, "") diff --git a/python/cutlass_library/heuristics.py b/python/cutlass_library/heuristics.py index 83421a0642..89923384df 100644 --- a/python/cutlass_library/heuristics.py +++ b/python/cutlass_library/heuristics.py @@ -67,7 +67,7 @@ def serialize_heuristics_results_to_json(problems_with_configs, outfile_path): """ - Utilitiy function to write heuristics results to a json file for debug + Utility function to write heuristics results to a json file for debug args: problems_with_configs: List of problems provided to the heuristic, with a list of operations added to each problem dict @@ -142,7 +142,7 @@ def get_gemm_configs(problems, provider=None, count=1): - 'batch_count': Number of GEMM operations in batch (default: 1) - 'use_fast_acc': Enable fast accumulation for FP8 on Hopper (default: True) provider: Heuristics provider to use - count: Number of configurations to return per problem (defualt: 1) + count: Number of configurations to return per problem (default: 1) returns: A copy of the input dictionary, with key `configs` added containing the selected gemm configs diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index 5733ef2632..75ec76df33 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -31,7 +31,7 @@ ################################################################################################# """ -Utilities for filtering CUTLASS library kernels and emitting library intitialization +Utilities for filtering CUTLASS library kernels and emitting library initialization and 
 building code
 """
diff --git a/python/docs/externals/00_basic_gemm.ipynb b/python/docs/externals/00_basic_gemm.ipynb
index a18b320a84..f6396493f3 100644
--- a/python/docs/externals/00_basic_gemm.ipynb
+++ b/python/docs/externals/00_basic_gemm.ipynb
@@ -47,7 +47,7 @@
     "\n",
     "import cutlass\n",
     "\n",
-    "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
+    "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n",
     "# omit this information.\n",
     "print_module = True\n",
     "\n",
@@ -154,7 +154,7 @@
    "id": "4a5856de",
    "metadata": {},
    "source": [
-    "There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
+    "There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifying the types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
    ]
   },
   {
@@ -236,7 +236,7 @@
     "\n",
     "As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
     "\n",
-    "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
+    "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on initialization."
    ]
   },
   {
diff --git a/python/docs/externals/01_epilogue.ipynb b/python/docs/externals/01_epilogue.ipynb
index 2669802337..54a4864bad 100644
--- a/python/docs/externals/01_epilogue.ipynb
+++ b/python/docs/externals/01_epilogue.ipynb
@@ -47,7 +47,7 @@
     "\n",
     "import cutlass\n",
-    "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
+    "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n",
     "# omit this information.\n",
     "print_module = True\n",
     "\n",
diff --git a/tools/library/include/cutlass/library/types.h b/tools/library/include/cutlass/library/types.h
index 9f8c4ff13b..4fa829f37b 100644
--- a/tools/library/include/cutlass/library/types.h
+++ b/tools/library/include/cutlass/library/types.h
@@ -173,7 +173,7 @@ enum class SplitKMode {
   kInvalid
 };
-/// Indicates the classificaition of the math instruction
+/// Indicates the classification of the math instruction
 enum class OpcodeClassID {
   kSimt,
   kTensorOp,
diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h
index f537421751..2840ab32eb 100644
--- a/tools/library/include/cutlass/library/util.h
+++ b/tools/library/include/cutlass/library/util.h
@@ -175,7 +175,7 @@ ConvKind from_string(std::string const &str);
 /// Converts a RuntimeDatatype enumerant to a string
 char const *to_string(cutlass::library::RuntimeDatatype type, bool pretty = false);
-/// Convers a RuntimeDatatype enumerant from a string
+/// Converts a RuntimeDatatype enumerant from a string
 template<>
 cutlass::library::RuntimeDatatype from_string(std::string const &str);
@@ -183,14 +183,14 @@ cutlass::library::RuntimeDatatype from_string
 /// Converts a RasterOrder enumerant to a string
 char const *to_string(RasterOrder type, bool pretty = false);
-/// Convers a RasterOrder enumerant from a string
+/// Converts a RasterOrder enumerant from a string
 template<>
 RasterOrder from_string(std::string const &str);
 /// Converts a bool to a string
 char const *to_string(bool type, bool pretty = false);
-/// Convers a bool from a string
+/// Converts a bool from a string
 template<>
 bool from_string(std::string const &str);
diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu
index 00ad2e0ec0..76af5442ce 100644
--- a/tools/library/src/handle.cu
+++ b/tools/library/src/handle.cu
@@ -1100,7 +1100,7 @@ Operation const* find_conv_operation_for_parallel_reduction(Operation const *ope
   ConvDescription const &conv_desc = static_cast<ConvDescription const &>(operation->description());
-  // if the curren conv operation accumulator and output data type match return operation
+  // if the current conv operation accumulator and output data type match return operation
   if(conv_desc.tile_description.math_instruction.element_accumulator == conv_desc.C.element) {
     return operation;
   }
@@ -1145,7 +1145,7 @@ Operation const* find_conv_operation_for_parallel_reduction(Operation const *ope
     return nullptr;
   }
-  // return matching conv opertion (same tile sizes and instruction)
+  // return matching conv operation (same tile sizes and instruction)
   for (auto op : it->second) {
     if (op->description().tile_description == operation->description().tile_description) {
       return op;
@@ -1163,7 +1163,7 @@ Operation const* find_gemm_operation_for_parallel_reduction(Operation const *ope
   GemmDescription const &gemm_desc = static_cast<GemmDescription const &>(operation->description());
-  // if the curren gemm operation accumulator and output data type match return operation
+  // if the current gemm operation accumulator and output data type match return operation
   if(gemm_desc.tile_description.math_instruction.element_accumulator == gemm_desc.D.element) {
     return operation;
   }
@@ -1214,7 +1214,7 @@ Operation const* find_gemm_operation_for_parallel_reduction(Operation const *ope
     return nullptr;
   }
-  // return matching gemm opertion (same tile shape, stages, warp count, and instruction)
+  // return matching gemm operation (same tile shape, stages, warp count, and instruction)
   for (auto op : it->second) {
     if (op->description().tile_description == operation->description().tile_description) {
       return op;
diff --git a/tools/profiler/include/cutlass/profiler/problem_space.h b/tools/profiler/include/cutlass/profiler/problem_space.h
index 9bdbec657c..fb0ad932fa 100644
--- a/tools/profiler/include/cutlass/profiler/problem_space.h
+++ b/tools/profiler/include/cutlass/profiler/problem_space.h
@@ -208,7 +208,7 @@ struct KernelArgument {
   virtual ~KernelArgument();
-  /// Returns true if the kernel argument iself is empty
+  /// Returns true if the kernel argument itself is empty
   virtual bool not_null() const =0;
   /// Returns a string name for debugging
diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu
index 7fc1d288a0..021edd77ba 100644
--- a/tools/profiler/src/options.cu
+++ b/tools/profiler/src/options.cu
@@ -534,7 +534,7 @@ void Options::Profiling::print_usage(std::ostream &out) const {
     << "  --profiling-duration= "
     << "   Time to spend profiling each kernel (ms)." << end_of_line
-    << "   Overriden by `profiling-iterations` when `profiling-iterations` > 0." << end_of_line
+    << "   Overridden by `profiling-iterations` when `profiling-iterations` > 0." << end_of_line
     << "   Note that `min-iterations` must also be satisfied.\n\n"
     << "  --min-iterations= "
diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h
index 7c6f803c47..ca8ab16eb2 100644
--- a/tools/util/include/cutlass/util/reference/device/convolution.h
+++ b/tools/util/include/cutlass/util/reference/device/convolution.h
@@ -131,7 +131,7 @@ __global__ void Conv2dFprop(
       for (int S = 0; S < problem_size.S; ++S) {
         for (int C = 0; C < problem_size.C; ++C) {
-          // Get group id of currnet channel
+          // Get group id of current channel
           int c_group_idx = C / c_per_group;
           // Load from activations tensor
diff --git a/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
index a738101660..a525f4dae5 100644
--- a/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
@@ -185,7 +185,7 @@ void Rank2KComplex(
     }
   }
-  /* Zeoring out accum for second HERK */
+  /* Zeroing out accum for second HERK */
  for (int j = 0; j < Nblock; j++) {
    for (int i = 0; i < Mblock; i++) {
      accum[i][j] = initial_accum;
diff --git a/tools/util/scripts/split_test_cmake.py b/tools/util/scripts/split_test_cmake.py
index 6541ce1b26..77efa1be8f 100644
--- a/tools/util/scripts/split_test_cmake.py
+++ b/tools/util/scripts/split_test_cmake.py
@@ -282,14 +282,14 @@ def add_filler_text(text):
         if len(splits) > 1:
             if not splits[0].isspace():
-                # Only add text to filler if there are non-whitespace charcters
+                # Only add text to filler if there are non-whitespace characters
                 # preceding the TEST definition in the line
                 filler_text += splits[0]
             # The new line is just the TEST-related line
             line = 'TEST' + splits[-1]
-            # Add tests and transtion to TestDeclaredWaitingStart state.
+            # Add tests and transition to TestDeclaredWaitingStart state.
             # Do not add the line to the test text of the new test case; this
             # will be done in either the TestDeclaredWaitingStart state processing
             # below or in the InTest state processing below.