csrc/fused_moe/cutlass_backend: 1 file changed, +2 -2 lines changed
Columns: original file line number | diff line number | diff line change
@@ -1457,7 +1457,7 @@ __host__ __device__ constexpr static U arrayConvert(T const& input) {
14571457//  (k-1)*rows_in_input all map to row 0 in the original matrix. Thus, to know where to read in the
14581458//  source matrix, we simply take the modulus of the expanded index.
14591459
1460- constexpr  static  int  EXPAND_THREADS_PER_BLOCK = 256 ;
1460+ constexpr  static  int  EXPAND_THREADS_PER_BLOCK = 128 ;
14611461
14621462template  <class  InputActivationsType , class  ExpandedActivationsType ,
14631463          TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
@@ -1697,7 +1697,7 @@ void expandInputRowsKernelLauncher(
16971697
16981698  static  int64_t  const  smCount = tensorrt_llm::common::getMultiProcessorCount ();
16991699  //  Note: Launching 8 blocks of 256 threads per SM can fully leverage the memory bandwidth (tested on B200); 16 blocks of 128 threads per SM keeps the same thread count per SM.
1700-   int64_t  const  blocks = std::min (smCount * 8 , std::max (num_rows * k, num_padding_tokens));
1700+   int64_t  const  blocks = std::min (smCount * 16 , std::max (num_rows * k, num_padding_tokens));
17011701  int64_t  const  threads = EXPAND_THREADS_PER_BLOCK;
17021702
17031703  auto  func = [&]() {
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments