pytorch · cyyever · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/.clang-tidy b/.clang-tidy
@@ -14,21 +14,17 @@ cppcoreguidelines-special-member-functions,
 bugprone*,
 -bugprone-crtp-constructor-accessibility,
 -bugprone-easily-swappable-parameters,
--bugprone-exception-escape,
 -bugprone-implicit*,
 -bugprone-macro-parentheses,
 -bugprone-narrowing-conversions,
 -bugprone-reserved-identifier,
--bugprone-signed-char-misuse,
 -bugprone-switch-missing-default-case,
 misc-*,
 -misc-confusable-identifiers,
 -misc-const-correctness,
 -misc-include-cleaner,
--misc-no-recursion,
 -misc-non-private-member-variables-in-classes,
 -misc-use-anonymous-namespace,
--misc-unused-parameters,
 modernize*,
 -modernize-avoid-c-arrays,
 -modernize-avoid-bind,
@@ -49,7 +45,6 @@ readability*,
 -readability-avoid-unconditional-preprocessor-if,
 -readability-braces-around-statements,
 -readability-container-contains,
--readability-convert-member-functions-to-static,
 -readability-else-after-return,
 -readability-function-cognitive-complexity,
 -readability-identifier-length,

diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
@@ -218,11 +218,10 @@ void performance_test(
     int num_instances,
     bool flush,
     int repetitions,
-    bool is_mkl) {
+    bool is_mkl [[maybe_unused]]) {
 #ifdef USE_MKL
   mkl_set_xerbla((XerblaEntry)test_xerbla);
 #endif
-  (void)is_mkl; // Suppress unused variable warning
 
   float alpha = 1.f, beta = 1.f;
   matrix_op_t btran = matrix_op_t::Transpose;

diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
@@ -7,6 +7,7 @@
  */
 
 #include <algorithm>
+#include <cassert>
 #include <chrono>
 #include <cmath>
 #include <cstdio>

diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 #include <array>
-#include <cassert>
 #include <chrono>
 #include <cstdlib>
 #include <iostream>

diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
@@ -12,7 +12,6 @@
 // upgraded to match with new fbgemm interface.
 
 #include <cpuinfo.h>
-#include <cassert>
 
 #include "./FbgemmPackMatrixB.h" // @manual
 #include "./FloatConversion.h" // @manual

diff --git a/include/fbgemm/FbgemmFP32.h b/include/fbgemm/FbgemmFP32.h
@@ -6,7 +6,6 @@
 // upgraded to match with new fbgemm interface.
 
 #include <cpuinfo.h>
-#include <cassert>
 #include <cstdlib>
 #include <memory>
 #include <stdexcept>

diff --git a/include/fbgemm/FbgemmFPCommon.h b/include/fbgemm/FbgemmFPCommon.h
@@ -115,7 +115,7 @@ FBGEMM_API void cblas_gemm_compute(
 // autotuned kernel splits for various cases m = 1:mb_max
 template <typename T>
 void cblas_gemm_compute(
-    const matrix_op_t transa,
+    const matrix_op_t transa [[maybe_unused]],
     const int m,
     const float* A,
     const PackedGemmMatrixB<T>& Bp,
@@ -130,7 +130,6 @@ void cblas_gemm_compute(
   assert(cpuinfo_has_x86_f16c());
 #endif
   assert(transa == matrix_op_t::NoTranspose);
-  (void)transa; // Suppress unused variable warning
 
   const auto iset = fbgemmInstructionSet();
   // private scratchpad storage
@@ -170,12 +169,10 @@ void cblas_gemm_compute(
     assert(mb < static_cast<int64_t>(partition.size()));
     for (auto k_ind = 0; k_ind < k; k_ind += Bp.blockRowSize()) {
       // set up proper accumulation to avoid "Nan" problem
-      float beta_ = NAN;
-      if (k_ind == 0) {
-        // accumulate of beta != 0.0
-        // do not!!! accumulate otherwise
-        beta_ = beta;
-      } else {
+      // accumulate of beta != 0.0
+      // do not!!! accumulate otherwise
+      float beta_ = beta;
+      if (k_ind != 0) {
         // always accumulate with beta_ = 1.0f
         beta_ = 1.0f;
       }
@@ -278,8 +275,7 @@ void cblas_gemm_compute(
             // use one thread to handle the fringe cases
             if (thread_id == num_threads - 1) {
               // leftover
-              const int rem = n - last_blk_col;
-              (void)rem; // Suppress unused variable warning
+              const int rem [[maybe_unused]] = n - last_blk_col;
               assert(rem < Bp.blockColSize());
 
               // small temporary buffer: the size should be larger than the

diff --git a/include/fbgemm/FbgemmPackMatrixB.h b/include/fbgemm/FbgemmPackMatrixB.h
@@ -205,11 +205,11 @@ class PackedGemmMatrixB {
     uint64_t r = (uint64_t)r_;
     uint64_t c = (uint64_t)c_;
 
-    uint64_t block_row_id = r / blockRowSize(),
-             brow_offset =
+    uint64_t block_row_id = r / blockRowSize();
+    uint64_t brow_offset =
                  (block_row_id * nbcol_) * (blockRowSize() * blockColSize());
-    uint64_t block_col_id = c / blockColSize(),
-             bcol_offset = block_col_id *
+    uint64_t block_col_id = c / blockColSize();
+    uint64_t bcol_offset = block_col_id *
         ((static_cast<int64_t>(block_row_id) != nbrow_ - 1)
              ? (blockRowSize() * blockColSize())
              : (last_brow_ * blockColSize()));

diff --git a/include/fbgemm/FloatConversion.h b/include/fbgemm/FloatConversion.h
@@ -146,7 +146,7 @@ template <typename Src, typename Tgt, RoundingMode RoundingMode>
           tgt_mantissa += 1;
         }
       } else {
-        assert(RoundingMode == RoundingMode::ToZero);
+        static_assert(RoundingMode == RoundingMode::ToZero);
       }
       return tgt_sign | tgt_mantissa; // tgt_exponent == 0
     } else {
@@ -205,7 +205,7 @@ template <typename Src, typename Tgt, RoundingMode RoundingMode>
       }
     }
   } else {
-    assert(RoundingMode == RoundingMode::ToZero);
+    static_assert(RoundingMode == RoundingMode::ToZero);
   }
   return tgt_sign | tgt_exponent | tgt_mantissa;
 }

diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
@@ -95,10 +95,9 @@ ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE, outT, inT, nextOPType>::f(
         } else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
           int g = block.col_start / ncol_per_group;
           Bq_zero_point_idx = g;
-        } else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
-          Bq_zero_point_idx = j;
         } else {
-          assert(false && "unknown quantization granularity");
+          static_assert(Q_GRAN == QuantizationGranularity::OUT_CHANNEL);
+          Bq_zero_point_idx = j;
         }
         if (q_row_offsets_) {
           raw -= q_row_offsets_[i - block.row_start] *
@@ -225,10 +224,9 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
         } else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
           int g = block.col_start / ncol_per_group;
           Bq_zero_point_idx = g;
-        } else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
-          Bq_zero_point_idx = j;
         } else {
-          assert(false && "unknown quantization granularity");
+          static_assert(Q_GRAN == QuantizationGranularity::OUT_CHANNEL);
+          Bq_zero_point_idx = j;
         }
         if (q_row_offsets_) {
           raw -= q_row_offsets_[i - block.row_start] *
@@ -239,7 +237,7 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
           res += bias_[j];
         }
         out[i * ld_out + j] = res;
-        if (FUSE_RELU) {
+        if constexpr (FUSE_RELU) {
           out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
         }
       }

diff --git a/include/fbgemm/QuantUtils.h b/include/fbgemm/QuantUtils.h
@@ -87,7 +87,7 @@ T Quantize(
   // and nearbyint(2.5) is 2.0
   // Adding zero_point before or after rounding can make a difference
   // in exactly halfway cases.
-  if (LEGACY) {
+  if constexpr (LEGACY) {
     transformed_val = std::nearbyint(zero_point + transformed_val);
   } else {
     transformed_val = zero_point + std::nearbyint(transformed_val);

diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
@@ -270,8 +270,8 @@ FBGEMM_API thread_type_t fbgemmGetThreadPartition(
     int g,
     int m,
     int n,
-    int num_threads,
     int thread_id,
+    int num_threads,
     int n_align = 64);
 
 template <int SIZE, typename T = std::int32_t>

diff --git a/src/DirectConv.h b/src/DirectConv.h
@@ -9,7 +9,6 @@
 #pragma once
 #include <asmjit/x86.h> // @manual
 #include <cpuinfo.h>
-#include <cassert>
 #include <cstdint>
 #include <map>
 #include <mutex>

diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc
@@ -682,22 +682,21 @@ GenEmbeddingSpMDMLookup<
           a->imul(scratchReg1_, static_cast<asmjit::Imm>(fused_block_size));
 
           // broadcast the scale
-          x86::Mem scale_src, bias_src;
           constexpr unsigned int CACHE_LINE_LEN = 64;
           if constexpr (is_8bit_in) {
             if (scale_bias_last) {
-              scale_src = x86::dword_ptr(
+              auto scale_src = x86::dword_ptr(
                   input, scratchReg1_, 0, block_size * sizeof(uint8_t));
-              bias_src = x86::dword_ptr(
+              auto bias_src = x86::dword_ptr(
                   input,
                   scratchReg1_,
                   0,
                   block_size * sizeof(uint8_t) + sizeof(float));
               a->vbroadcastss(scale_vreg, scale_src);
               a->vbroadcastss(bias_vreg, bias_src);
             } else {
-              scale_src = x86::word_ptr(input, scratchReg1_);
-              bias_src =
+              auto scale_src = x86::word_ptr(input, scratchReg1_);
+              auto bias_src =
                   x86::word_ptr(input, scratchReg1_, 0, sizeof(uint16_t));
               a->vpbroadcastw(scale_vreg.half(), scale_src);
               a->vpbroadcastw(bias_vreg.half(), bias_src);

diff --git a/src/EmbeddingSpMDMNBit.cc b/src/EmbeddingSpMDMNBit.cc
@@ -217,7 +217,7 @@ GenEmbeddingSpMDMNBitLookup<
                 ROWWISE_SPARSE>::jit_embedding_kernel {
         // TODO: Make this tunable
         int pref_dist = prefetch;
-        bool areIndices64b = is_same_v<indxType, int64_t>;
+        constexpr bool areIndices64b = is_same_v<indxType, int64_t>;
 
         asmjit::CodeHolder code;
         code.init(runtime().environment());
@@ -596,7 +596,7 @@ GenEmbeddingSpMDMNBitLookup<
           a->jl(LoopDataIndexEnd);
 
           // Array out of bound check
-          if (areIndices64b) {
+          if constexpr (areIndices64b) {
             a->mov(scratchReg1_, x86::qword_ptr(indices));
           } else {
             a->mov(scratchReg1_.r32(), x86::dword_ptr(indices));
@@ -643,7 +643,7 @@ GenEmbeddingSpMDMNBitLookup<
             a->cmp(scratchReg2_, index_size);
             a->jge(pref_dist_reset_start);
 
-            if (areIndices64b) {
+            if constexpr (areIndices64b) {
               a->mov(
                   scratchReg2_,
                   x86::qword_ptr(indices, pref_dist * sizeof(indxType)));
@@ -658,7 +658,7 @@ GenEmbeddingSpMDMNBitLookup<
             a->bind(pref_dist_reset_start);
             // things are not okay just get the current row
             // this can be improved to getting the max dist row.
-            if (areIndices64b) {
+            if constexpr (areIndices64b) {
               a->mov(scratchReg2_, x86::qword_ptr(indices));
             } else {
               a->mov(scratchReg2_.r32(), x86::dword_ptr(indices));

diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
@@ -143,8 +143,7 @@ void ExecuteKernel<
   const inst_set_t isa = fbgemmInstructionSet();
   switch (isa) {
     case inst_set_t::avx512_vnni:
-      if constexpr (std::is_same_v<
-                        typename packingAMatrix::accType,
+      if constexpr (std::is_same_v<typename packingAMatrix::accType,
                         std::int16_t>) {
         // For AVX512VNNI, we redirect int16_t to int32_t accumulation.
         CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
@@ -224,8 +223,7 @@ void ExecuteKernel<
       if (nc != nbSize_) {
         switch (isa) {
           case inst_set_t::avx512_vnni:
-            if (std::
-                    is_same_v<typename packingAMatrix::accType, std::int16_t>) {
+            if constexpr (std::is_same_v<typename packingAMatrix::accType, std::int16_t>) {
               // For AVX512VNNI, we redirect int16_t to int32_t accumulation.
               CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
               fn = codeObj.getOrCreate<inst_set_t::avx512_vnni>(
@@ -237,8 +235,7 @@ void ExecuteKernel<
             break;
 
           case inst_set_t::avx512_vnni_ymm:
-            if (std::
-                    is_same_v<typename packingAMatrix::accType, std::int16_t>) {
+            if constexpr (std::is_same_v<typename packingAMatrix::accType, std::int16_t>) {
               // For AVX512VNNI, we redirect int16_t to int32_t accumulation.
               CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
               fn = codeObj.getOrCreate<inst_set_t::avx512_vnni_ymm>(

diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
@@ -128,15 +128,10 @@ void fbgemmPacked(
   // remainders
   int _kc = KDimPerGroup % KCB;
 
-  int kc = 0, mc = 0;
-
-  block_type_t blockA{0, 0, 0, 0};
-
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-  std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start,
-      t_start, t_end;
+  std::chrono::time_point<std::chrono::high_resolution_clock> t_end;
   double dt;
-  t_start = std::chrono::high_resolution_clock::now();
+  auto t_start = std::chrono::high_resolution_clock::now();
   t_very_start = std::chrono::high_resolution_clock::now();
 #endif
 
@@ -166,11 +161,11 @@ void fbgemmPacked(
             th_info,
             blocking_params);
     for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
-      mc = std::min(i_end - i, MCB);
+      int mc = std::min(i_end - i, MCB);
       for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index
-        kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
+        int kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
         // pack A matrix
-        blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
+        block_type_t blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
         packA.pack(blockA);
 
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN

diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
@@ -271,7 +271,8 @@ int fbgemmConv(
     case optimized_conv_t::directconv: {
       // specialized direct convolution path
       // std::cout << "Directconv fast path" << std::endl;
-      fbgemmDirectConv<SPATIAL_DIM, processOutputType::QGRANType>(
+      if constexpr (SPATIAL_DIM == 2) {
+        fbgemmDirectConv<SPATIAL_DIM, processOutputType::QGRANType>(
           conv_p,
           // Aint8,
           activations,
@@ -282,6 +283,9 @@ int fbgemmConv(
           outProcess.getBias(),
           thread_id,
           num_threads);
+      } else {
+        assert(false && "1d/3d direct conv not supported");
+      }
       break;
     }
     case optimized_conv_t::fastpath1d: {

diff --git a/src/FbgemmI64.cc b/src/FbgemmI64.cc
@@ -437,8 +437,8 @@ void cblas_gemm_i64_i64acc(
         false /* accum */, MCB, NCB, KCB);
   }
 
-  vector<int64_t> At, Bt;
   // TODO: handle transpose during packing
+  vector<int64_t> At;
   if (transa == matrix_op_t::Transpose) {
     At.resize(M * K);
     for (int i = 0; i < M; ++i) {
@@ -449,6 +449,7 @@ void cblas_gemm_i64_i64acc(
     A = At.data();
     lda = K;
   }
+  vector<int64_t> Bt;
   if (transb == matrix_op_t::Transpose) {
     Bt.resize(K * N);
     for (int k = 0; k < K; ++k) {