Memcpy Sample

Test system: FX8150 (3.6 GHz, no turbo), 1333 MHz DDR3 single-channel RAM, Ubuntu 18.04 LTS, GCC 10.0 with vectorization-helper flags enabled.

Because this kernel performs no computation, it makes the kernel-launch latency (the cost of the work-scheduling computations) visible. A real-world kernel should have enough work per SIMD group (kernel body) to amortize this latency.

#include "VectorizedKernel.h"
#include<vector>
int main()
{
	// memcpy kernel
	constexpr int simd = 8; // good enough for FX8150 CPU (AVX1 + some pipelining)
	auto kernelGatherScatter = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){

		const int currentSimdWidth = factory.width; // number of lanes in this SIMD group (unused in this sample)
		auto tmp = factory.template generate<float>();
		tmp.readFrom(bufferIn,idThread);  // gather: each lane reads from its own index
		tmp.writeTo(bufferOut,idThread);  // scatter: each lane writes to its own index

	},Vectorization::KernelArgs<float*,float*>{});

	auto kernelContiguous = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){

		const int currentSimdWidth = factory.width; // number of lanes in this SIMD group (unused in this sample)
		auto tmp = factory.template generate<float>();
		tmp.readFromContiguous(bufferIn,idThread);  // contiguous: the first work-item in the SIMD group decides where to read
		tmp.writeToContiguous(bufferOut,idThread);  // contiguous: the first work-item in the SIMD group decides where to write

	},Vectorization::KernelArgs<float*,float*>{});

	// n does not have to be a multiple of simd; the tail is processed with simd = 1 automatically.
	for(int j=0;j<25;j++)
	{
		const int n = std::pow(2,j);
		std::vector<float> test1(n),test2(n);
		for(int i=0;i<n;i++)
		{
			test1[i]=i;
		}

		const int repeats = 5;
		std::cout<< "Gather + scatter: (size = "<<(sizeof(float) * n * repeats) << " bytes) "; // bytes read over all repeats
		{
			Vectorization::Bench bench(0); // measures this scope, prints elapsed seconds on destruction
			for(int k=0;k<repeats;k++)
			{
				kernelGatherScatter.run(n,test1.data(),test2.data());
			}
		}


	}

	for(int j=0;j<25;j++)
	{
		const int n = std::pow(2,j);
		std::vector<float> test1(n),test2(n);
		for(int i=0;i<n;i++)
		{
			test1[i]=i;
		}

		const int repeats = 5;
		std::cout<< "Contiguous read/write: (size = "<<(sizeof(float) * n * repeats) << " bytes) "; // bytes read over all repeats
		{
			Vectorization::Bench bench(0); // measures this scope, prints elapsed seconds on destruction
			for(int k=0;k<repeats;k++)
			{
				kernelContiguous.run(n,test1.data(),test2.data());
			}
		}


	}

	return 0;
}
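
For comparison, the same Bench scope can time a plain std::memcpy over one large buffer. This is a minimal baseline sketch, assuming (as the sample above implies) that Vectorization::Bench(0) prints the elapsed seconds when it goes out of scope:

#include "VectorizedKernel.h"
#include <cstring>
#include <vector>
int main()
{
	const int n = 1 << 24; // 16M floats = 64 MiB, large enough to stream from RAM
	std::vector<float> src(n, 1.0f), dst(n);
	{
		Vectorization::Bench bench(0); // assumed: prints elapsed seconds at scope exit
		std::memcpy(dst.data(), src.data(), sizeof(float) * n);
	}
	return 0;
}

On a single-channel DDR3 system like the one above, this gives a rough bandwidth ceiling to compare the kernel timings against.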

Output:

Gather + scatter: (size = 20 bytes) 3.74e-07 seconds
Gather + scatter: (size = 40 bytes) 1.48e-07 seconds
Gather + scatter: (size = 80 bytes) 1.75e-07 seconds
Gather + scatter: (size = 160 bytes) 1.56e-07 seconds
Gather + scatter: (size = 320 bytes) 1.7e-07 seconds
Gather + scatter: (size = 640 bytes) 1.88e-07 seconds
Gather + scatter: (size = 1280 bytes) 1.94e-07 seconds
Gather + scatter: (size = 2560 bytes) 2.43e-07 seconds
Gather + scatter: (size = 5120 bytes) 3.2e-07 seconds
Gather + scatter: (size = 10240 bytes) 6.99e-07 seconds
Gather + scatter: (size = 20480 bytes) 1.561e-06 seconds
Gather + scatter: (size = 40960 bytes) 3.76e-06 seconds
Gather + scatter: (size = 81920 bytes) 1.3459e-05 seconds
Gather + scatter: (size = 163840 bytes) 2.3698e-05 seconds
Gather + scatter: (size = 327680 bytes) 4.4843e-05 seconds
Gather + scatter: (size = 655360 bytes) 5.5269e-05 seconds
Gather + scatter: (size = 1310720 bytes) 0.00011429 seconds
Gather + scatter: (size = 2621440 bytes) 0.000269901 seconds
Gather + scatter: (size = 5242880 bytes) 0.000786592 seconds
Gather + scatter: (size = 10485760 bytes) 0.00201955 seconds
Gather + scatter: (size = 20971520 bytes) 0.00420292 seconds
Gather + scatter: (size = 41943040 bytes) 0.0183857 seconds
Gather + scatter: (size = 83886080 bytes) 0.0352718 seconds
Gather + scatter: (size = 167772160 bytes) 0.0690292 seconds
Gather + scatter: (size = 335544320 bytes) 0.139878 seconds
Contiguous read/write: (size = 20 bytes) 2.02e-07 seconds
Contiguous read/write: (size = 40 bytes) 1.09e-07 seconds
Contiguous read/write: (size = 80 bytes) 9.9e-08 seconds
Contiguous read/write: (size = 160 bytes) 9.2e-08 seconds
Contiguous read/write: (size = 320 bytes) 1.19e-07 seconds
Contiguous read/write: (size = 640 bytes) 1.34e-07 seconds
Contiguous read/write: (size = 1280 bytes) 1.57e-07 seconds
Contiguous read/write: (size = 2560 bytes) 1.69e-07 seconds
Contiguous read/write: (size = 5120 bytes) 2.24e-07 seconds
Contiguous read/write: (size = 10240 bytes) 3.55e-07 seconds
Contiguous read/write: (size = 20480 bytes) 9.09e-07 seconds
Contiguous read/write: (size = 40960 bytes) 2.597e-06 seconds
Contiguous read/write: (size = 81920 bytes) 7.853e-06 seconds
Contiguous read/write: (size = 163840 bytes) 1.7595e-05 seconds
Contiguous read/write: (size = 327680 bytes) 3.4657e-05 seconds
Contiguous read/write: (size = 655360 bytes) 6.4849e-05 seconds
Contiguous read/write: (size = 1310720 bytes) 0.000130463 seconds
Contiguous read/write: (size = 2621440 bytes) 0.000257956 seconds
Contiguous read/write: (size = 5242880 bytes) 0.00075907 seconds
Contiguous read/write: (size = 10485760 bytes) 0.00219717 seconds
Contiguous read/write: (size = 20971520 bytes) 0.00463505 seconds
Contiguous read/write: (size = 41943040 bytes) 0.0195087 seconds
Contiguous read/write: (size = 83886080 bytes) 0.0366837 seconds
Contiguous read/write: (size = 167772160 bytes) 0.0685079 seconds
Contiguous read/write: (size = 335544320 bytes) 0.146371 seconds
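
For large buffers both paths converge, as the copy becomes memory-bound; for small buffers the contiguous path is faster, likely because it avoids per-lane index handling, and launch latency dominates below a few kilobytes. To confirm that the kernels really copy the data, a check can be placed after each repeat loop in the sample above. A minimal sketch using only the standard library (add #include <algorithm> at the top):

		// after the repeat loop, inside the j loop:
		if(!std::equal(test1.begin(), test1.end(), test2.begin()))
		{
			std::cout << "copy mismatch at n = " << n << std::endl;
		}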