Matrix Matrix Multiplication Sample
Test system: FX8150 (3.6 GHz, no turbo) + single-channel DDR3-1333 RAM
Number of computations = 2 (one add + one mul) × matrixSize³, i.e. 2 × 512³ = 268,435,456 (~¼ billion) operations for a 512×512 matrix.
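That figure, and the GFLOPS numbers quoted below, can be sanity-checked with a standalone snippet (plain C++, independent of the library; the 0.0925 s runtime is taken from the FX8150 output further down):

#include <cstdio>

int main()
{
    const long long n = 512;               // matrixSize
    const long long ops = 2LL * n * n * n; // one mul + one add per inner-loop iteration
    const double seconds = 0.0925;         // measured FX8150 runtime from the output below
    std::printf("%lld operations, %.2f GFLOPS\n", ops, ops / seconds / 1e9);
    return 0;
}

This prints "268435456 operations, 2.90 GFLOPS", matching the timings reported below.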
#include "VectorizedKernel.h"
#include<vector>
int main()
{
// C = A x A kernel
constexpr int simd = 64;
constexpr int matrixSize = 512;
auto kernelMatrixMultiplication = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
const int currentSimdWidth = factory.width;
auto ROW = factory.template generate<int>();
idThread.div(matrixSize, ROW);
auto COL = factory.template generate<int>();
idThread.modulus(matrixSize, COL);
auto tmpSum = factory.template generate<float>(0.0f);
auto A = factory.template generate<float>(0.0f);
auto B = factory.template generate<float>(0.0f);
auto AmulB = factory.template generate<float>(0.0f);
auto indexA = factory.template generate<int>(0);
auto indexB = factory.template generate<int>(0);
for(int i=0;i<matrixSize;i++)
{
// compute index of A element
ROW.mul(matrixSize, indexA);
indexA.add(i,indexA);
// compute index of B element
COL.add(i*matrixSize, indexB);
// read A and B
A.readFrom(bufferIn, indexA); // A[ROW * N + i] --> gather operation
B.readFrom(bufferIn, indexB); // A[i * N + COL] --> contiguous
// multiply
A.mul(B,AmulB);
// add
tmpSum.add(AmulB, tmpSum);
}
// write C index
auto writeC = factory.template generate<int>();
ROW.mul(matrixSize, writeC);
writeC.add(COL, writeC);
tmpSum.writeTo(bufferOut,writeC);
},Vectorization::KernelArgs<float*,float*>{});
for(int j=0;j<25;j++)
{
std::vector<float> test1(matrixSize*matrixSize),test2(matrixSize*matrixSize);
for(int i=0;i<matrixSize*matrixSize;i++)
{
test1[i]=2.0f;
}
std::cout<< "matrix multiplication ("<<matrixSize*(size_t)matrixSize*matrixSize*2<<" operations total) ";
{
Vectorization::Bench bench(0);
kernelMatrixMultiplication.run(matrixSize * matrixSize,test1.data(),test2.data());
}
for(int i=0;i<3;i++)
{
std::cout<<test2[i]<<" ";
}
std::cout<<std::endl;
}
return 0;
}
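Since every input element is 2.0f, each element of C = A × A equals 512 × (2 × 2) = 2048, which matches the printed values. A minimal scalar reference for verifying the kernel output (a hypothetical helper, not part of the library):

#include <vector>

// Naive scalar C = A x A, O(N^3): slow but obviously correct, for verification only.
std::vector<float> referenceMatMul(const std::vector<float>& A, int n)
{
    std::vector<float> C(n * n, 0.0f);
    for (int row = 0; row < n; row++)
        for (int i = 0; i < n; i++)
            for (int col = 0; col < n; col++)
                C[row * n + col] += A[row * n + i] * A[i * n + col];
    return C;
}

// usage sketch: compare against the kernel result
//   auto expected = referenceMatMul(test1, matrixSize);
//   // expected[k] == test2[k] for all k (exact here, since 2048 is representable in float)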
Output on FX8150 CPU:
matrix multiplication (268435456 operations total) 0.101376 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0926768 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0928444 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.092645 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0928248 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0924905 seconds
This is equivalent to ~2.9 GFLOPS, or roughly 1.45 multiply-add pairs per nanosecond.
Output on Ryzen 7900 CPU (simd=16):
matrix multiplication (268435456 operations total) 0.0332312 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0332552 seconds
2048 2048 2048
matrix multiplication (268435456 operations total) 0.0333451 seconds
2048 2048 2048
This is an inefficient, non-cache-friendly, unoptimized version of matrix-matrix multiplication (the Ryzen 7900 timings correspond to ~8 GFLOPS). On a godbolt.org Cascade Lake server it completes in 0.044 seconds, or ~6 GFLOPS, which is far below the theoretical maximum.
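The standard remedy for the cache problem is loop tiling: work on TILE × TILE sub-blocks so the working set stays in L1/L2 cache. A minimal scalar sketch, not using the library's API (TILE = 64 is an assumed value to be tuned per CPU, and n is assumed divisible by TILE):

#include <vector>

constexpr int TILE = 64; // assumed block size; tune for the target cache

// Tiled scalar C = A x A: blocks the row/inner/column loops so each
// TILE x TILE sub-block of A and C is reused while it is still in cache.
std::vector<float> tiledMatMul(const std::vector<float>& A, int n)
{
    std::vector<float> C(n * n, 0.0f);
    for (int r0 = 0; r0 < n; r0 += TILE)
        for (int k0 = 0; k0 < n; k0 += TILE)
            for (int c0 = 0; c0 < n; c0 += TILE)
                for (int r = r0; r < r0 + TILE; r++)
                    for (int k = k0; k < k0 + TILE; k++)
                    {
                        const float a = A[r * n + k]; // scalar broadcast of A[r][k]
                        for (int c = c0; c < c0 + TILE; c++)
                            C[r * n + c] += a * A[k * n + c];
                    }
    return C;
}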