Skip to content

Various fixes #4529

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,17 @@ cppcoreguidelines-special-member-functions,
bugprone*,
-bugprone-crtp-constructor-accessibility,
-bugprone-easily-swappable-parameters,
-bugprone-exception-escape,
-bugprone-implicit*,
-bugprone-macro-parentheses,
-bugprone-narrowing-conversions,
-bugprone-reserved-identifier,
-bugprone-signed-char-misuse,
-bugprone-switch-missing-default-case,
misc-*,
-misc-confusable-identifiers,
-misc-const-correctness,
-misc-include-cleaner,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
modernize*,
-modernize-avoid-c-arrays,
-modernize-avoid-bind,
Expand All @@ -49,7 +45,6 @@ readability*,
-readability-avoid-unconditional-preprocessor-if,
-readability-braces-around-statements,
-readability-container-contains,
-readability-convert-member-functions-to-static,
-readability-else-after-return,
-readability-function-cognitive-complexity,
-readability-identifier-length,
Expand Down
3 changes: 1 addition & 2 deletions bench/BenchUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,10 @@ void performance_test(
int num_instances,
bool flush,
int repetitions,
bool is_mkl) {
bool is_mkl [[maybe_unused]]) {
#ifdef USE_MKL
mkl_set_xerbla((XerblaEntry)test_xerbla);
#endif
(void)is_mkl; // Suppress unused variable warning

float alpha = 1.f, beta = 1.f;
matrix_op_t btran = matrix_op_t::Transpose;
Expand Down
1 change: 1 addition & 0 deletions bench/DepthwiseBenchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
Expand Down
1 change: 0 additions & 1 deletion bench/I8SpmdmBenchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <iostream>
Expand Down
1 change: 0 additions & 1 deletion include/fbgemm/FbgemmFP16.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
// upgraded to match with new fbgemm interface.

#include <cpuinfo.h>
#include <cassert>

#include "./FbgemmPackMatrixB.h" // @manual
#include "./FloatConversion.h" // @manual
Expand Down
1 change: 0 additions & 1 deletion include/fbgemm/FbgemmFP32.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
// upgraded to match with new fbgemm interface.

#include <cpuinfo.h>
#include <cassert>
#include <cstdlib>
#include <memory>
#include <stdexcept>
Expand Down
16 changes: 6 additions & 10 deletions include/fbgemm/FbgemmFPCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ FBGEMM_API void cblas_gemm_compute(
// autotuned kernel splits for various cases m = 1:mb_max
template <typename T>
void cblas_gemm_compute(
const matrix_op_t transa,
const matrix_op_t transa [[maybe_unused]],
const int m,
const float* A,
const PackedGemmMatrixB<T>& Bp,
Expand All @@ -130,7 +130,6 @@ void cblas_gemm_compute(
assert(cpuinfo_has_x86_f16c());
#endif
assert(transa == matrix_op_t::NoTranspose);
(void)transa; // Suppress unused variable warning

const auto iset = fbgemmInstructionSet();
// private scratchpad storage
Expand Down Expand Up @@ -170,12 +169,10 @@ void cblas_gemm_compute(
assert(mb < static_cast<int64_t>(partition.size()));
for (auto k_ind = 0; k_ind < k; k_ind += Bp.blockRowSize()) {
// set up proper accumulation to avoid "Nan" problem
float beta_ = NAN;
if (k_ind == 0) {
// accumulate of beta != 0.0
// do not!!! accumulate otherwise
beta_ = beta;
} else {
// accumulate of beta != 0.0
// do not!!! accumulate otherwise
float beta_ = beta;
if (k_ind != 0) {
// always accumulate with beta_ = 1.0f
beta_ = 1.0f;
}
Expand Down Expand Up @@ -278,8 +275,7 @@ void cblas_gemm_compute(
// use one thread to handle the fringe cases
if (thread_id == num_threads - 1) {
// leftover
const int rem = n - last_blk_col;
(void)rem; // Suppress unused variable warning
const int rem [[maybe_unused]] = n - last_blk_col;
assert(rem < Bp.blockColSize());

// small temporary buffer: the size should be larger than the
Expand Down
8 changes: 4 additions & 4 deletions include/fbgemm/FbgemmPackMatrixB.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,11 @@ class PackedGemmMatrixB {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;

uint64_t block_row_id = r / blockRowSize(),
brow_offset =
uint64_t block_row_id = r / blockRowSize();
uint64_t brow_offset =
(block_row_id * nbcol_) * (blockRowSize() * blockColSize());
uint64_t block_col_id = c / blockColSize(),
bcol_offset = block_col_id *
uint64_t block_col_id = c / blockColSize();
uint64_t bcol_offset = block_col_id *
((static_cast<int64_t>(block_row_id) != nbrow_ - 1)
? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
Expand Down
4 changes: 2 additions & 2 deletions include/fbgemm/FloatConversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ template <typename Src, typename Tgt, RoundingMode RoundingMode>
tgt_mantissa += 1;
}
} else {
assert(RoundingMode == RoundingMode::ToZero);
static_assert(RoundingMode == RoundingMode::ToZero);
}
return tgt_sign | tgt_mantissa; // tgt_exponent == 0
} else {
Expand Down Expand Up @@ -205,7 +205,7 @@ template <typename Src, typename Tgt, RoundingMode RoundingMode>
}
}
} else {
assert(RoundingMode == RoundingMode::ToZero);
static_assert(RoundingMode == RoundingMode::ToZero);
}
return tgt_sign | tgt_exponent | tgt_mantissa;
}
Expand Down
12 changes: 5 additions & 7 deletions include/fbgemm/OutputProcessing-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,9 @@ ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE, outT, inT, nextOPType>::f(
} else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
int g = block.col_start / ncol_per_group;
Bq_zero_point_idx = g;
} else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
Bq_zero_point_idx = j;
} else {
assert(false && "unknown quantization granularity");
static_assert(Q_GRAN == QuantizationGranularity::OUT_CHANNEL);
Bq_zero_point_idx = j;
}
if (q_row_offsets_) {
raw -= q_row_offsets_[i - block.row_start] *
Expand Down Expand Up @@ -225,10 +224,9 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
} else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
int g = block.col_start / ncol_per_group;
Bq_zero_point_idx = g;
} else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
Bq_zero_point_idx = j;
} else {
assert(false && "unknown quantization granularity");
static_assert(Q_GRAN == QuantizationGranularity::OUT_CHANNEL);
Bq_zero_point_idx = j;
}
if (q_row_offsets_) {
raw -= q_row_offsets_[i - block.row_start] *
Expand All @@ -239,7 +237,7 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
res += bias_[j];
}
out[i * ld_out + j] = res;
if (FUSE_RELU) {
if constexpr (FUSE_RELU) {
out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
}
}
Expand Down
2 changes: 1 addition & 1 deletion include/fbgemm/QuantUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ T Quantize(
// and nearbyint(2.5) is 2.0
// Adding zero_point before or after rounding can make a difference
// in exactly halfway cases.
if (LEGACY) {
if constexpr (LEGACY) {
transformed_val = std::nearbyint(zero_point + transformed_val);
} else {
transformed_val = zero_point + std::nearbyint(transformed_val);
Expand Down
2 changes: 1 addition & 1 deletion include/fbgemm/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ FBGEMM_API thread_type_t fbgemmGetThreadPartition(
int g,
int m,
int n,
int num_threads,
int thread_id,
int num_threads,
int n_align = 64);

template <int SIZE, typename T = std::int32_t>
Expand Down
1 change: 0 additions & 1 deletion src/DirectConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#pragma once
#include <asmjit/x86.h> // @manual
#include <cpuinfo.h>
#include <cassert>
#include <cstdint>
#include <map>
#include <mutex>
Expand Down
9 changes: 4 additions & 5 deletions src/EmbeddingSpMDM.cc
Original file line number Diff line number Diff line change
Expand Up @@ -682,22 +682,21 @@ GenEmbeddingSpMDMLookup<
a->imul(scratchReg1_, static_cast<asmjit::Imm>(fused_block_size));

// broadcast the scale
x86::Mem scale_src, bias_src;
constexpr unsigned int CACHE_LINE_LEN = 64;
if constexpr (is_8bit_in) {
if (scale_bias_last) {
scale_src = x86::dword_ptr(
auto scale_src = x86::dword_ptr(
input, scratchReg1_, 0, block_size * sizeof(uint8_t));
bias_src = x86::dword_ptr(
auto bias_src = x86::dword_ptr(
input,
scratchReg1_,
0,
block_size * sizeof(uint8_t) + sizeof(float));
a->vbroadcastss(scale_vreg, scale_src);
a->vbroadcastss(bias_vreg, bias_src);
} else {
scale_src = x86::word_ptr(input, scratchReg1_);
bias_src =
auto scale_src = x86::word_ptr(input, scratchReg1_);
auto bias_src =
x86::word_ptr(input, scratchReg1_, 0, sizeof(uint16_t));
a->vpbroadcastw(scale_vreg.half(), scale_src);
a->vpbroadcastw(bias_vreg.half(), bias_src);
Expand Down
8 changes: 4 additions & 4 deletions src/EmbeddingSpMDMNBit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ GenEmbeddingSpMDMNBitLookup<
ROWWISE_SPARSE>::jit_embedding_kernel {
// TODO: Make this tunable
int pref_dist = prefetch;
bool areIndices64b = is_same_v<indxType, int64_t>;
constexpr bool areIndices64b = is_same_v<indxType, int64_t>;

asmjit::CodeHolder code;
code.init(runtime().environment());
Expand Down Expand Up @@ -596,7 +596,7 @@ GenEmbeddingSpMDMNBitLookup<
a->jl(LoopDataIndexEnd);

// Array out of bound check
if (areIndices64b) {
if constexpr (areIndices64b) {
a->mov(scratchReg1_, x86::qword_ptr(indices));
} else {
a->mov(scratchReg1_.r32(), x86::dword_ptr(indices));
Expand Down Expand Up @@ -643,7 +643,7 @@ GenEmbeddingSpMDMNBitLookup<
a->cmp(scratchReg2_, index_size);
a->jge(pref_dist_reset_start);

if (areIndices64b) {
if constexpr (areIndices64b) {
a->mov(
scratchReg2_,
x86::qword_ptr(indices, pref_dist * sizeof(indxType)));
Expand All @@ -658,7 +658,7 @@ GenEmbeddingSpMDMNBitLookup<
a->bind(pref_dist_reset_start);
// things are not okay just get the current row
// this can be improved to getting the max dist row.
if (areIndices64b) {
if constexpr (areIndices64b) {
a->mov(scratchReg2_, x86::qword_ptr(indices));
} else {
a->mov(scratchReg2_.r32(), x86::dword_ptr(indices));
Expand Down
9 changes: 3 additions & 6 deletions src/ExecuteKernelU8S8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,7 @@ void ExecuteKernel<
const inst_set_t isa = fbgemmInstructionSet();
switch (isa) {
case inst_set_t::avx512_vnni:
if constexpr (std::is_same_v<
typename packingAMatrix::accType,
if constexpr (std::is_same_v<typename packingAMatrix::accType,
std::int16_t>) {
// For AVX512VNNI, we redirect int16_t to int32_t accumulation.
CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
Expand Down Expand Up @@ -224,8 +223,7 @@ void ExecuteKernel<
if (nc != nbSize_) {
switch (isa) {
case inst_set_t::avx512_vnni:
if (std::
is_same_v<typename packingAMatrix::accType, std::int16_t>) {
if constexpr (std::is_same_v<typename packingAMatrix::accType, std::int16_t>) {
// For AVX512VNNI, we redirect int16_t to int32_t accumulation.
CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
fn = codeObj.getOrCreate<inst_set_t::avx512_vnni>(
Expand All @@ -237,8 +235,7 @@ void ExecuteKernel<
break;

case inst_set_t::avx512_vnni_ymm:
if (std::
is_same_v<typename packingAMatrix::accType, std::int16_t>) {
if constexpr (std::is_same_v<typename packingAMatrix::accType, std::int16_t>) {
// For AVX512VNNI, we redirect int16_t to int32_t accumulation.
CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
fn = codeObj.getOrCreate<inst_set_t::avx512_vnni_ymm>(
Expand Down
15 changes: 5 additions & 10 deletions src/Fbgemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,15 +128,10 @@ void fbgemmPacked(
// remainders
int _kc = KDimPerGroup % KCB;

int kc = 0, mc = 0;

block_type_t blockA{0, 0, 0, 0};

#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start,
t_start, t_end;
std::chrono::time_point<std::chrono::high_resolution_clock> t_end;
double dt;
t_start = std::chrono::high_resolution_clock::now();
auto t_start = std::chrono::high_resolution_clock::now();
t_very_start = std::chrono::high_resolution_clock::now();
#endif

Expand Down Expand Up @@ -166,11 +161,11 @@ void fbgemmPacked(
th_info,
blocking_params);
for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
mc = std::min(i_end - i, MCB);
int mc = std::min(i_end - i, MCB);
for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index
kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
int kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
// pack A matrix
blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
block_type_t blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
packA.pack(blockA);

#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
Expand Down
6 changes: 5 additions & 1 deletion src/FbgemmConv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@ int fbgemmConv(
case optimized_conv_t::directconv: {
// specialized direct convolution path
// std::cout << "Directconv fast path" << std::endl;
fbgemmDirectConv<SPATIAL_DIM, processOutputType::QGRANType>(
if constexpr (SPATIAL_DIM == 2) {
fbgemmDirectConv<SPATIAL_DIM, processOutputType::QGRANType>(
conv_p,
// Aint8,
activations,
Expand All @@ -282,6 +283,9 @@ int fbgemmConv(
outProcess.getBias(),
thread_id,
num_threads);
} else {
assert(false && "1d/3d direct conv not supported");
}
break;
}
case optimized_conv_t::fastpath1d: {
Expand Down
3 changes: 2 additions & 1 deletion src/FbgemmI64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,8 @@ void cblas_gemm_i64_i64acc(
false /* accum */, MCB, NCB, KCB);
}

vector<int64_t> At, Bt;
// TODO: handle transpose during packing
vector<int64_t> At;
if (transa == matrix_op_t::Transpose) {
At.resize(M * K);
for (int i = 0; i < M; ++i) {
Expand All @@ -449,6 +449,7 @@ void cblas_gemm_i64_i64acc(
A = At.data();
lda = K;
}
vector<int64_t> Bt;
if (transb == matrix_op_t::Transpose) {
Bt.resize(K * N);
for (int k = 0; k < K; ++k) {
Expand Down
Loading
Loading