-
Notifications
You must be signed in to change notification settings - Fork 1
Matrix Matrix Multiplication Sample
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Apr 18, 2022
·
8 revisions
Test system: FX8150 (3.6GHz no turbo) + single channel DDR3 1333MHz RAM
Number of floating-point operations = 2 (one add + one multiply) x matrixSize^3, i.e. 2 x 512^3 = 268,435,456 (about 1/4 billion) operations for a 512x512 matrix
#include "VectorizedKernel.h"
#include <cstddef>
#include <iostream>
#include <vector>
// Benchmark sample: multiplies a matrixSize x matrixSize matrix by itself
// (C = A x A) using a Vectorization kernel, repeats the measurement several
// times, and prints the first few output elements after each run.
//
// Returns 0 on completion.
int main()
{
    constexpr int simd = 8;                                // SIMD width passed to createKernel
    constexpr int matrixSize = 512;                        // matrices are matrixSize x matrixSize
    constexpr int elementCount = matrixSize * matrixSize;  // elements per matrix / work-items per run
    constexpr int repetitions = 25;                        // number of timed runs
    constexpr int previewCount = 3;                        // output elements printed after each run

    // C = A x A kernel. bufferIn holds A, bufferOut receives C; both are
    // indexed as flat arrays with element (r, c) at r * matrixSize + c.
    auto kernelMatrixMultiplication = Vectorization::createKernel<simd>(
        [&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut)
        {
            // Split the work-item id into the output element's row and column
            // (presumably id / matrixSize and id % matrixSize, as the
            // div/modulus names and the index comments below suggest).
            const auto ROW = idThread.div(matrixSize);
            const auto COL = idThread.modulus(matrixSize);

            // ROW * N does not depend on the inner loop, so compute it once.
            const auto rowOffset = ROW.mul(matrixSize);

            // Accumulates the dot product of row ROW and column COL of A.
            auto tmpSum = factory.template generate<float>(0.0f);
            for (int i = 0; i < matrixSize; i++)
            {
                const auto indexA = rowOffset.add(i);
                const auto indexB = COL.add(i * matrixSize);
                auto A = factory.template generate<float>();
                auto B = factory.template generate<float>();
                A.readFrom(bufferIn, indexA); // A[ROW * N + i]
                B.readFrom(bufferIn, indexB); // A[i * N + COL]
                tmpSum = tmpSum.add(A.mul(B));
            }

            tmpSum.writeTo(bufferOut, rowOffset.add(COL)); // C[ROW * N + COL]
        },
        Vectorization::KernelArgs<float*,float*>{});

    for (int run = 0; run < repetitions; run++)
    {
        // Fresh buffers for every run: the input is all 2.0f, so every output
        // element should be matrixSize * 2 * 2 (2048 for matrixSize = 512);
        // the output starts value-initialized to 0.
        std::vector<float> test1(elementCount, 2.0f);
        std::vector<float> test2(elementCount);

        // Widen to size_t before multiplying so the operation count cannot
        // overflow int for larger matrix sizes.
        std::cout << "matrix multiplication ("
                  << static_cast<std::size_t>(matrixSize) * matrixSize * matrixSize * 2
                  << " operations total) ";
        {
            // Scoped so that only the kernel run is inside the Bench object's
            // lifetime (presumably Bench reports the elapsed time when it is
            // destroyed, as the sample output suggests).
            Vectorization::Bench bench(0);
            kernelMatrixMultiplication.run(elementCount, test1.data(), test2.data());
        }

        for (int i = 0; i < previewCount; i++)
        {
            std::cout << test2[i] << " ";
        }
        // Flush after each run so results show up while the benchmark is still going.
        std::cout << std::endl;
    }
    return 0;
}
output:
matrix multiplication (268435456 operations total) 0.148851 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.135291 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.13494 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.135016 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.134685 seconds
This is equivalent to about 2 GFLOPS (268,435,456 operations / 0.135 seconds) for a non-optimized, simple loop on a single thread. Throughput increases to about 2.5 GFLOPS (0.105 seconds) with simd=4 instead of simd=8.