-
Notifications
You must be signed in to change notification settings - Fork 1
Memcpy Sample
Hüseyin Tuğrul BÜYÜKIŞIK edited this page May 13, 2023
·
9 revisions
Test system: FX8150 (3.6GHz, no turbo), 1333MHz DDR3 single-channel RAM, Ubuntu 18.04LTS, GCC 10.0 with vectorization-helper flags enabled.
Because this kernel performs no computation, the kernel-launch latency (caused by work-scheduling computations) becomes visible. A real-world algorithm should do enough work per SIMD group (kernel body) to amortize that latency.
#include "VectorizedKernel.h"
#include<iostream>
#include<vector>
// Memcpy benchmark: copies a float buffer with two kernel flavours
// (gather/scatter vs. contiguous access) for buffer sizes 2^0 .. 2^24 elements
// and prints one line per size. Since the kernels do no arithmetic, the
// measurement is dominated by kernel-launch overhead for small sizes.
int main()
{
    // SIMD width of a work group: good enough for FX8150 CPU (AVX1 + some pipelining)
    constexpr int simd = 8;

    // memcpy kernel, gather/scatter flavour: reads and writes through the per-lane ids.
    auto kernelGatherScatter = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
        auto tmp = factory.template generate<float>();
        tmp.readFrom(bufferIn,idThread); // gather
        tmp.writeTo(bufferOut,idThread); // scatter
    },Vectorization::KernelArgs<float*,float*>{});

    // memcpy kernel, contiguous flavour.
    auto kernelContiguous = Vectorization::createKernel<simd>([&](auto & factory, auto & idThread, float * bufferIn, float * bufferOut){
        auto tmp = factory.template generate<float>();
        tmp.readFromContiguous(bufferIn,idThread); // contiguous (first work-item in simd group decides where to read)
        tmp.writeToContiguous(bufferOut,idThread); // contiguous (first work-item in simd group decides where to write)
    },Vectorization::KernelArgs<float*,float*>{});

    constexpr int sizeSteps = 25; // buffer sizes 2^0 .. 2^24 elements
    constexpr int repeats = 5;    // kernel launches per timed measurement

    // Runs one kernel over every buffer size and prints "<label>: (size = N bytes) "
    // before each timed section.
    auto runBenchmark = [&](const char * label, auto & kernel)
    {
        for(int j=0;j<sizeSteps;j++)
        {
            // does not have to be multiple of simd. tail is computed with simd=1 automatically.
            // Exact integer power of two; avoids truncating a floating-point std::pow result.
            const int n = 1 << j;

            std::vector<float> test1(n),test2(n);
            for(int i=0;i<n;i++)
            {
                test1[i]=static_cast<float>(i);
            }

            std::cout<< label << ": (size = "<<(sizeof(float) * n * repeats) << " bytes) ";
            {
                // NOTE(review): Bench presumably reports the elapsed seconds when it
                // goes out of scope — confirm in VectorizedKernel.h.
                Vectorization::Bench bench(0);
                for(int k=0;k<repeats;k++)
                {
                    kernel.run(n,test1.data(),test2.data());
                }
            }
        }
    };

    runBenchmark("Gather + scatter", kernelGatherScatter);
    runBenchmark("Contiguous read/write", kernelContiguous);
    return 0;
}
Output on single-channel memory of FX8150:
Gather + scatter: (size = 20 bytes) 2.78e-07 seconds
Gather + scatter: (size = 40 bytes) 1.36e-07 seconds
Gather + scatter: (size = 80 bytes) 1.63e-07 seconds
Gather + scatter: (size = 160 bytes) 9.5e-08 seconds
Gather + scatter: (size = 320 bytes) 1.37e-07 seconds
Gather + scatter: (size = 640 bytes) 1.65e-07 seconds
Gather + scatter: (size = 1280 bytes) 1.55e-07 seconds
Gather + scatter: (size = 2560 bytes) 1.86e-07 seconds
Gather + scatter: (size = 5120 bytes) 2.94e-07 seconds
Gather + scatter: (size = 10240 bytes) 3.76e-07 seconds
Gather + scatter: (size = 20480 bytes) 1.073e-06 seconds <-- ~20 GB/s
Gather + scatter: (size = 40960 bytes) 3.692e-06 seconds
Gather + scatter: (size = 81920 bytes) 9.238e-06 seconds
Gather + scatter: (size = 163840 bytes) 1.6645e-05 seconds
Gather + scatter: (size = 327680 bytes) 4.6444e-05 seconds
Gather + scatter: (size = 655360 bytes) 4.2665e-05 seconds
Gather + scatter: (size = 1310720 bytes) 8.9339e-05 seconds
Gather + scatter: (size = 2621440 bytes) 0.000178935 seconds
Gather + scatter: (size = 5242880 bytes) 0.000652814 seconds
Gather + scatter: (size = 10485760 bytes) 0.00194283 seconds
Gather + scatter: (size = 20971520 bytes) 0.00556092 seconds
Gather + scatter: (size = 41943040 bytes) 0.0199376 seconds
Gather + scatter: (size = 83886080 bytes) 0.0383999 seconds
Gather + scatter: (size = 167772160 bytes) 0.0734081 seconds
Gather + scatter: (size = 335544320 bytes) 0.150067 seconds
Contiguous read/write: (size = 20 bytes) 3.05e-07 seconds
Contiguous read/write: (size = 40 bytes) 1.37e-07 seconds
Contiguous read/write: (size = 80 bytes) 1.82e-07 seconds
Contiguous read/write: (size = 160 bytes) 9.2e-08 seconds
Contiguous read/write: (size = 320 bytes) 1.2e-07 seconds
Contiguous read/write: (size = 640 bytes) 1.23e-07 seconds
Contiguous read/write: (size = 1280 bytes) 1.5e-07 seconds
Contiguous read/write: (size = 2560 bytes) 2e-07 seconds
Contiguous read/write: (size = 5120 bytes) 2.31e-07 seconds
Contiguous read/write: (size = 10240 bytes) 4.99e-07 seconds <-- 20 GB/s
Contiguous read/write: (size = 20480 bytes) 1.117e-06 seconds
Contiguous read/write: (size = 40960 bytes) 3.004e-06 seconds
Contiguous read/write: (size = 81920 bytes) 9.371e-06 seconds
Contiguous read/write: (size = 163840 bytes) 1.6818e-05 seconds
Contiguous read/write: (size = 327680 bytes) 4.3517e-05 seconds
Contiguous read/write: (size = 655360 bytes) 8.6749e-05 seconds
Contiguous read/write: (size = 1310720 bytes) 0.000173887 seconds
Contiguous read/write: (size = 2621440 bytes) 0.00034889 seconds
Contiguous read/write: (size = 5242880 bytes) 0.000937983 seconds
Contiguous read/write: (size = 10485760 bytes) 0.00225532 seconds
Contiguous read/write: (size = 20971520 bytes) 0.00487051 seconds
Contiguous read/write: (size = 41943040 bytes) 0.0199355 seconds
Contiguous read/write: (size = 83886080 bytes) 0.0392087 seconds
Contiguous read/write: (size = 167772160 bytes) 0.0747353 seconds
Contiguous read/write: (size = 335544320 bytes) 0.14468 seconds
Output on dual-channel memory of Ryzen 7900:
Gather + scatter: (size = 20 bytes) 1.1e-07 seconds
Gather + scatter: (size = 40 bytes) 4e-08 seconds
Gather + scatter: (size = 80 bytes) 4e-08 seconds
Gather + scatter: (size = 160 bytes) 6e-08 seconds
Gather + scatter: (size = 320 bytes) 5e-08 seconds
Gather + scatter: (size = 640 bytes) 5e-08 seconds
Gather + scatter: (size = 1280 bytes) 6e-08 seconds
Gather + scatter: (size = 2560 bytes) 5e-08 seconds
Gather + scatter: (size = 5120 bytes) 8e-08 seconds
Gather + scatter: (size = 10240 bytes) 1.1e-07 seconds ---> 93 GB/s
Gather + scatter: (size = 20480 bytes) 1.8e-07 seconds
Gather + scatter: (size = 40960 bytes) 3.6e-07 seconds
Gather + scatter: (size = 81920 bytes) 7.7e-07 seconds
Gather + scatter: (size = 163840 bytes) 2.08e-06 seconds
Gather + scatter: (size = 327680 bytes) 4.15e-06 seconds
Gather + scatter: (size = 655360 bytes) 7.94e-06 seconds
Gather + scatter: (size = 1310720 bytes) 1.683e-05 seconds
Gather + scatter: (size = 2621440 bytes) 4.008e-05 seconds
Gather + scatter: (size = 5242880 bytes) 8.584e-05 seconds
Gather + scatter: (size = 10485760 bytes) 0.00017342 seconds
Gather + scatter: (size = 20971520 bytes) 0.00035677 seconds
Gather + scatter: (size = 41943040 bytes) 0.000777581 seconds
Gather + scatter: (size = 83886080 bytes) 0.00267528 seconds
Gather + scatter: (size = 167772160 bytes) 0.00740086 seconds
Gather + scatter: (size = 335544320 bytes) 0.0165531 seconds
Contiguous read/write: (size = 20 bytes) 7e-08 seconds
Contiguous read/write: (size = 40 bytes) 5e-08 seconds
Contiguous read/write: (size = 80 bytes) 4e-08 seconds
Contiguous read/write: (size = 160 bytes) 6e-08 seconds
Contiguous read/write: (size = 320 bytes) 5e-08 seconds
Contiguous read/write: (size = 640 bytes) 4e-08 seconds
Contiguous read/write: (size = 1280 bytes) 5e-08 seconds
Contiguous read/write: (size = 2560 bytes) 6e-08 seconds
Contiguous read/write: (size = 5120 bytes) 8e-08 seconds
Contiguous read/write: (size = 10240 bytes) 1.1e-07 seconds ---> 93 GB/s (maximum ~340 GB/s per core on L1 cache read, ~ 170 GB/s write L1)
Contiguous read/write: (size = 20480 bytes) 1.8e-07 seconds
Contiguous read/write: (size = 40960 bytes) 3.5e-07 seconds ---> 117 GB/s
Contiguous read/write: (size = 81920 bytes) 7.5e-07 seconds
Contiguous read/write: (size = 163840 bytes) 2.05e-06 seconds
Contiguous read/write: (size = 327680 bytes) 4.13e-06 seconds
Contiguous read/write: (size = 655360 bytes) 8.3e-06 seconds
Contiguous read/write: (size = 1310720 bytes) 2.028e-05 seconds
Contiguous read/write: (size = 2621440 bytes) 4.781e-05 seconds
Contiguous read/write: (size = 5242880 bytes) 0.00012084 seconds
Contiguous read/write: (size = 10485760 bytes) 0.00026154 seconds
Contiguous read/write: (size = 20971520 bytes) 0.000465621 seconds
Contiguous read/write: (size = 41943040 bytes) 0.000776061 seconds
Contiguous read/write: (size = 83886080 bytes) 0.00281754 seconds
Contiguous read/write: (size = 167772160 bytes) 0.00730954 seconds
Contiguous read/write: (size = 335544320 bytes) 0.0167374 seconds ---> 20 GB/s (per core, only 4-5 cores enough to use full RAM bandwidth)
An efficient memcpy should use a single work-group that loops over the data, repeating the same operation, instead of launching many work-groups.