Memcpy Sample

Test system: FX8150 (3.6 GHz, no turbo), 1333 MHz DDR3 single-channel RAM, Ubuntu 18.04 LTS, GCC 10.0 with vectorization-helper flags enabled.

Because this kernel performs no computation, it makes the kernel-launch latency (the cost of the work-scheduling computations) visible. A real-world kernel should have enough work per SIMD group (kernel body) to amortize this latency.

#include "VectorizedKernel.h"
#include<vector>
int main()
{
	// memcpy kernel
	constexpr int simd = 8; // good enough for FX8150 CPU (AVX1 + some pipelining)
	auto kernelGatherScatter = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){

		const int currentSimdWidth = factory.width; // number of lanes in this SIMD group (unused in this sample)
		auto tmp = factory.template generate<float>();
		tmp.readFrom(bufferIn,idThread);  // gather: each lane reads from its own index
		tmp.writeTo(bufferOut,idThread);  // scatter: each lane writes to its own index

	},Vectorization::KernelArgs<float*,float*>{});

	auto kernelContiguous = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){

		const int currentSimdWidth = factory.width; // number of lanes in this SIMD group (unused in this sample)
		auto tmp = factory.template generate<float>();
		tmp.readFromContiguous(bufferIn,idThread);  // contiguous: the first work-item in the SIMD group decides where to read
		tmp.writeToContiguous(bufferOut,idThread);  // contiguous: the first work-item in the SIMD group decides where to write

	},Vectorization::KernelArgs<float*,float*>{});

	// n does not have to be a multiple of simd; the tail is processed with simd = 1 automatically.
	for(int j=0;j<25;j++)
	{
		const int n = std::pow(2,j);
		std::vector<float> test1(n),test2(n);
		for(int i=0;i<n;i++)
		{
			test1[i]=i;
		}

		const int repeats = 5;
		std::cout<< "Gather + scatter: (size = "<<(sizeof(float) * n * repeats) << " bytes) "; // bytes read over all repeats
		{
			Vectorization::Bench bench(0); // measures this scope, prints elapsed seconds on destruction
			for(int k=0;k<repeats;k++)
			{
				kernelGatherScatter.run(n,test1.data(),test2.data());
			}
		}


	}

	for(int j=0;j<25;j++)
	{
		const int n = std::pow(2,j);
		std::vector<float> test1(n),test2(n);
		for(int i=0;i<n;i++)
		{
			test1[i]=i;
		}

		const int repeats = 5;
		std::cout<< "Contiguous read/write: (size = "<<(sizeof(float) * n * repeats) << " bytes) "; // bytes read over all repeats
		{
			Vectorization::Bench bench(0); // measures this scope, prints elapsed seconds on destruction
			for(int k=0;k<repeats;k++)
			{
				kernelContiguous.run(n,test1.data(),test2.data());
			}
		}


	}

	return 0;
}
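
For comparison, the same Bench scope can time a plain std::memcpy over one large buffer. This is a minimal baseline sketch, assuming (as the sample above implies) that Vectorization::Bench(0) prints the elapsed seconds when it goes out of scope:

#include "VectorizedKernel.h"
#include <cstring>
#include <vector>
int main()
{
	const int n = 1 << 24; // 16M floats = 64 MiB, large enough to stream from RAM
	std::vector<float> src(n, 1.0f), dst(n);
	{
		Vectorization::Bench bench(0); // assumed: prints elapsed seconds at scope exit
		std::memcpy(dst.data(), src.data(), sizeof(float) * n);
	}
	return 0;
}

On a single-channel DDR3 system like the one above, this gives a rough bandwidth ceiling to compare the kernel timings against.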

Output:

Gather + scatter: (size = 20 bytes) 3.74e-07 seconds
Gather + scatter: (size = 40 bytes) 1.48e-07 seconds
Gather + scatter: (size = 80 bytes) 1.75e-07 seconds
Gather + scatter: (size = 160 bytes) 1.56e-07 seconds
Gather + scatter: (size = 320 bytes) 1.7e-07 seconds
Gather + scatter: (size = 640 bytes) 1.88e-07 seconds
Gather + scatter: (size = 1280 bytes) 1.94e-07 seconds
Gather + scatter: (size = 2560 bytes) 2.43e-07 seconds
Gather + scatter: (size = 5120 bytes) 3.2e-07 seconds
Gather + scatter: (size = 10240 bytes) 6.99e-07 seconds
Gather + scatter: (size = 20480 bytes) 1.561e-06 seconds
Gather + scatter: (size = 40960 bytes) 3.76e-06 seconds
Gather + scatter: (size = 81920 bytes) 1.3459e-05 seconds
Gather + scatter: (size = 163840 bytes) 2.3698e-05 seconds
Gather + scatter: (size = 327680 bytes) 4.4843e-05 seconds
Gather + scatter: (size = 655360 bytes) 5.5269e-05 seconds
Gather + scatter: (size = 1310720 bytes) 0.00011429 seconds
Gather + scatter: (size = 2621440 bytes) 0.000269901 seconds
Gather + scatter: (size = 5242880 bytes) 0.000786592 seconds
Gather + scatter: (size = 10485760 bytes) 0.00201955 seconds
Gather + scatter: (size = 20971520 bytes) 0.00420292 seconds
Gather + scatter: (size = 41943040 bytes) 0.0183857 seconds
Gather + scatter: (size = 83886080 bytes) 0.0352718 seconds
Gather + scatter: (size = 167772160 bytes) 0.0690292 seconds
Gather + scatter: (size = 335544320 bytes) 0.139878 seconds
Contiguous read/write: (size = 20 bytes) 2.02e-07 seconds
Contiguous read/write: (size = 40 bytes) 1.09e-07 seconds
Contiguous read/write: (size = 80 bytes) 9.9e-08 seconds
Contiguous read/write: (size = 160 bytes) 9.2e-08 seconds
Contiguous read/write: (size = 320 bytes) 1.19e-07 seconds
Contiguous read/write: (size = 640 bytes) 1.34e-07 seconds
Contiguous read/write: (size = 1280 bytes) 1.57e-07 seconds
Contiguous read/write: (size = 2560 bytes) 1.69e-07 seconds
Contiguous read/write: (size = 5120 bytes) 2.24e-07 seconds
Contiguous read/write: (size = 10240 bytes) 3.55e-07 seconds
Contiguous read/write: (size = 20480 bytes) 9.09e-07 seconds
Contiguous read/write: (size = 40960 bytes) 2.597e-06 seconds
Contiguous read/write: (size = 81920 bytes) 7.853e-06 seconds
Contiguous read/write: (size = 163840 bytes) 1.7595e-05 seconds
Contiguous read/write: (size = 327680 bytes) 3.4657e-05 seconds
Contiguous read/write: (size = 655360 bytes) 6.4849e-05 seconds
Contiguous read/write: (size = 1310720 bytes) 0.000130463 seconds
Contiguous read/write: (size = 2621440 bytes) 0.000257956 seconds
Contiguous read/write: (size = 5242880 bytes) 0.00075907 seconds
Contiguous read/write: (size = 10485760 bytes) 0.00219717 seconds
Contiguous read/write: (size = 20971520 bytes) 0.00463505 seconds
Contiguous read/write: (size = 41943040 bytes) 0.0195087 seconds
Contiguous read/write: (size = 83886080 bytes) 0.0366837 seconds
Contiguous read/write: (size = 167772160 bytes) 0.0685079 seconds
Contiguous read/write: (size = 335544320 bytes) 0.146371 seconds
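
For large buffers both paths converge, as the copy becomes memory-bound; for small buffers the contiguous path is faster, likely because it avoids per-lane index handling, and launch latency dominates below a few kilobytes. To confirm that the kernels really copy the data, a check can be placed after each repeat loop in the sample above. A minimal sketch using only the standard library (add #include <algorithm> at the top):

		// after the repeat loop, inside the j loop:
		if(!std::equal(test1.begin(), test1.end(), test2.begin()))
		{
			std::cout << "copy mismatch at n = " << n << std::endl;
		}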