-
Notifications
You must be signed in to change notification settings - Fork 1
Memcpy Sample
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Apr 18, 2022
·
9 revisions
Test system: FX8150 (3.6GHz, no turbo), 1333MHz DDR3 single-channel RAM, Ubuntu 18.04LTS, GCC 10.0 with vectorization-helper flags enabled.
This no-compute algorithm makes the kernel-launch latency (caused by work-scheduling computations) visible. A real-world algorithm should do enough work per SIMD group (kernel body) to amortize this latency.
#include "VectorizedKernel.h"
#include<iostream>
#include<vector>
// Memcpy benchmark: copies a float buffer through two vectorized kernels
// (gather/scatter addressing vs. contiguous addressing) for power-of-two
// element counts and prints one timing line per size and kernel.
int main()
{
    constexpr int simd = 8;       // good enough for FX8150 CPU (AVX1 + some pipelining)
    constexpr int sizeSteps = 25; // element counts 2^0 .. 2^24
    constexpr int repeats = 5;    // kernel launches per timed measurement

    // memcpy kernel, per-work-item addressing.
    // factory.width holds the SIMD width of the current invocation if a kernel needs it.
    auto kernelGatherScatter = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
        auto tmp = factory.template generate<float>();
        tmp.readFrom(bufferIn,idThread); // gather
        tmp.writeTo(bufferOut,idThread); // scatter
    },Vectorization::KernelArgs<float*,float*>{});

    // memcpy kernel, contiguous addressing.
    auto kernelContiguous = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
        auto tmp = factory.template generate<float>();
        tmp.readFromContiguous(bufferIn,idThread); // contiguous (first work-item in simd group decides where to read)
        tmp.writeToContiguous(bufferOut,idThread); // contiguous (first work-item in simd group decides where to write)
    },Vectorization::KernelArgs<float*,float*>{});

    // Runs one kernel over every problem size and prints "<label>: (size = N bytes) ".
    // The reported size is the total number of bytes copied by all `repeats` launches.
    auto runBenchmark = [&](const char * label, auto & kernel)
    {
        for(int j=0;j<sizeSteps;j++)
        {
            // n does not have to be multiple of simd. tail is computed with simd=1 automatically.
            const int n = 1 << j; // exact integer power of two, no floating-point round trip

            std::vector<float> source(n),destination(n);
            for(int i=0;i<n;i++)
            {
                source[i]=static_cast<float>(i);
            }

            std::cout<< label << ": (size = "<<(sizeof(float) * n * repeats) << " bytes) ";
            {
                // Scoped so that only the kernel launches are inside the Bench lifetime.
                // NOTE(review): Bench presumably reports the elapsed seconds when it goes
                // out of scope -- confirm in VectorizedKernel.h.
                Vectorization::Bench bench(0);
                for(int k=0;k<repeats;k++)
                {
                    kernel.run(n,source.data(),destination.data());
                }
            }
        }
    };

    runBenchmark("Gather + scatter",kernelGatherScatter);
    runBenchmark("Contiguous read/write",kernelContiguous);
    return 0;
}
output:
Gather + scatter: (size = 20 bytes) 3.74e-07 seconds
Gather + scatter: (size = 40 bytes) 1.48e-07 seconds
Gather + scatter: (size = 80 bytes) 1.75e-07 seconds
Gather + scatter: (size = 160 bytes) 1.56e-07 seconds
Gather + scatter: (size = 320 bytes) 1.7e-07 seconds
Gather + scatter: (size = 640 bytes) 1.88e-07 seconds
Gather + scatter: (size = 1280 bytes) 1.94e-07 seconds
Gather + scatter: (size = 2560 bytes) 2.43e-07 seconds
Gather + scatter: (size = 5120 bytes) 3.2e-07 seconds
Gather + scatter: (size = 10240 bytes) 6.99e-07 seconds
Gather + scatter: (size = 20480 bytes) 1.561e-06 seconds
Gather + scatter: (size = 40960 bytes) 3.76e-06 seconds
Gather + scatter: (size = 81920 bytes) 1.3459e-05 seconds
Gather + scatter: (size = 163840 bytes) 2.3698e-05 seconds
Gather + scatter: (size = 327680 bytes) 4.4843e-05 seconds
Gather + scatter: (size = 655360 bytes) 5.5269e-05 seconds
Gather + scatter: (size = 1310720 bytes) 0.00011429 seconds
Gather + scatter: (size = 2621440 bytes) 0.000269901 seconds
Gather + scatter: (size = 5242880 bytes) 0.000786592 seconds
Gather + scatter: (size = 10485760 bytes) 0.00201955 seconds
Gather + scatter: (size = 20971520 bytes) 0.00420292 seconds
Gather + scatter: (size = 41943040 bytes) 0.0183857 seconds
Gather + scatter: (size = 83886080 bytes) 0.0352718 seconds
Gather + scatter: (size = 167772160 bytes) 0.0690292 seconds
Gather + scatter: (size = 335544320 bytes) 0.139878 seconds
Contiguous read/write: (size = 20 bytes) 2.02e-07 seconds
Contiguous read/write: (size = 40 bytes) 1.09e-07 seconds
Contiguous read/write: (size = 80 bytes) 9.9e-08 seconds
Contiguous read/write: (size = 160 bytes) 9.2e-08 seconds
Contiguous read/write: (size = 320 bytes) 1.19e-07 seconds
Contiguous read/write: (size = 640 bytes) 1.34e-07 seconds
Contiguous read/write: (size = 1280 bytes) 1.57e-07 seconds
Contiguous read/write: (size = 2560 bytes) 1.69e-07 seconds
Contiguous read/write: (size = 5120 bytes) 2.24e-07 seconds
Contiguous read/write: (size = 10240 bytes) 3.55e-07 seconds
Contiguous read/write: (size = 20480 bytes) 9.09e-07 seconds
Contiguous read/write: (size = 40960 bytes) 2.597e-06 seconds
Contiguous read/write: (size = 81920 bytes) 7.853e-06 seconds
Contiguous read/write: (size = 163840 bytes) 1.7595e-05 seconds
Contiguous read/write: (size = 327680 bytes) 3.4657e-05 seconds
Contiguous read/write: (size = 655360 bytes) 6.4849e-05 seconds
Contiguous read/write: (size = 1310720 bytes) 0.000130463 seconds
Contiguous read/write: (size = 2621440 bytes) 0.000257956 seconds
Contiguous read/write: (size = 5242880 bytes) 0.00075907 seconds
Contiguous read/write: (size = 10485760 bytes) 0.00219717 seconds
Contiguous read/write: (size = 20971520 bytes) 0.00463505 seconds
Contiguous read/write: (size = 41943040 bytes) 0.0195087 seconds
Contiguous read/write: (size = 83886080 bytes) 0.0366837 seconds
Contiguous read/write: (size = 167772160 bytes) 0.0685079 seconds
Contiguous read/write: (size = 335544320 bytes) 0.146371 seconds