Skip to content

Matrix Matrix Multiplication Sample

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Apr 18, 2022 · 8 revisions

Test system: FX8150 (3.6GHz no turbo) + single channel DDR3 1333MHz RAM

Number of floating-point operations = 2 (one add + one mul) x matrixSize^3 (268,435,456, i.e. roughly a quarter of a billion operations, for a 512x512 matrix)

#include "VectorizedKernel.h"

#include <iostream>
#include <vector>
// Benchmark sample: multiplies a matrixSize x matrixSize matrix by itself
// (C = A x A) with a kernel built by Vectorization::createKernel, repeats the
// run 25 times and prints the first three output elements after each run.
// Returns 0.
int main()
{
	// C = A x A kernel
	constexpr int simd = 8;
	constexpr int matrixSize = 512;
	// Number of elements in one matrix; also the work-item count passed to run().
	constexpr int elementCount = matrixSize * matrixSize;
	// One multiply and one add per inner-loop step, matrixSize^3 steps in total.
	// Widened to size_t before multiplying, as in the printed operation count.
	constexpr size_t totalOperations = static_cast<size_t>(matrixSize) * matrixSize * matrixSize * 2;

	// Kernel body: bufferIn is read as matrix A, bufferOut receives C.
	// NOTE(review): idThread presumably carries one work-item id per SIMD lane,
	// and div/modulus/add/mul presumably act lane-wise -- confirm against
	// VectorizedKernel.h.
	auto kernelMatrixMultiplication = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
		// Split the flat id into a (row, column) pair of the output matrix.
		const auto ROW = idThread.div(matrixSize);
		const auto COL = idThread.modulus(matrixSize);
		auto tmpSum = factory.template generate<float>(0.0f);

		// Dot product of row ROW with column COL, accumulated in tmpSum.
		for(int i=0;i<matrixSize;i++)
		{
			const auto indexA = ROW.mul(matrixSize).add(i);
			const auto indexB = COL.add(i*matrixSize);
			auto A = factory.template generate<float>();
			auto B = factory.template generate<float>();
			A.readFrom(bufferIn, indexA); // A[ROW * N + i]
			B.readFrom(bufferIn, indexB); // A[i * N + COL]
			tmpSum = tmpSum.add(A.mul(B));
		}

		const auto writeC = ROW.mul(matrixSize).add(COL); // C[ROW * N + COL]
		tmpSum.writeTo(bufferOut,writeC);

	},Vectorization::KernelArgs<float*,float*>{});

	for(int j=0;j<25;j++)
	{
		// Fresh buffers for every repetition: the input is filled with 2.0f,
		// the output starts value-initialized (0.0f).
		std::vector<float> test1(elementCount, 2.0f);
		std::vector<float> test2(elementCount);

		std::cout<< "matrix multiplication ("<<totalOperations<<" operations total) ";
		{
			// Scoped so that only the kernel run is inside the Bench lifetime.
			// NOTE(review): Bench presumably reports the elapsed time when it is
			// destroyed (the sample output shows "... seconds" right after the
			// message above) -- confirm against VectorizedKernel.h.
			Vectorization::Bench bench(0);
			kernelMatrixMultiplication.run(elementCount,test1.data(),test2.data());
		}

		// Print the first three results as a quick sanity check.
		for(int i=0;i<3;i++)
		{
			std::cout<<test2[i]<<" ";
		}
		std::cout<<std::endl;
	}

	return 0;
}

output:

matrix multiplication (268435456 operations total) 0.148851 seconds
2048 2048 2048 
matrix multiplication (268435456 operations total) 0.135291 seconds
2048 2048 2048 
matrix multiplication (268435456 operations total) 0.13494 seconds
2048 2048 2048 
matrix multiplication (268435456 operations total) 0.135016 seconds
2048 2048 2048 
matrix multiplication (268435456 operations total) 0.134685 seconds

This is equivalent to about 2 GFLOPS (non-optimized simple loop on a single thread). Throughput increases to about 2.5 GFLOPS (0.105 seconds) with simd=4 instead of simd=8.

Clone this wiki locally