-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprefix-scan.cpp
118 lines (105 loc) · 3.34 KB
/
prefix-scan.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <vector>
#include <iostream>
#include <easyvk.h>
#include <cassert>
#include <vector>
#include <unistd.h>
#define BATCH_SIZE 8
/**
 * Fills `ref` with a CPU reference for the scan of the sequence 0, 1, 2, ...
 *
 * After the call, ref[i] holds 0 + 1 + ... + i (so ref[0] == 0, ref[1] == 1,
 * ref[2] == 3, ...). The sum is accumulated in uint32_t and therefore wraps
 * modulo 2^32 for large sizes.
 *
 * @param ref   Destination array with room for at least `size` elements.
 * @param size  Number of elements to write. Nothing is written when
 *              `size <= 0` or `ref` is null.
 */
void computeReferencePrefixSum(uint32_t* ref, int size) {
  // Guard the empty case: unconditionally storing ref[0] would write past the
  // end of a zero-length buffer.
  if (ref == nullptr || size <= 0) {
    return;
  }
  uint32_t running = 0;
  for (int i = 0; i < size; i++) {
    running += static_cast<uint32_t>(i);
    ref[i] = running;
  }
}
/**
 * Command-line driver that runs the `prefix_scan` kernel through easyvk.
 *
 * Options:
 *   -t <n>  workgroup size (default 64)
 *   -w <n>  number of workgroups (default 32)
 *   -d <n>  index into the list of physical devices (default 0)
 *   -v      enable validation layers
 *   -c      compare every output element against a CPU reference
 *
 * The input buffer holds the values 0..size-1 where
 * size = numWorkgroups * workgroupSize * BATCH_SIZE.
 *
 * @return 0 on success; 1 on an invalid command line, an invalid device index,
 *         or (with -c) the first output element that differs from the reference.
 */
int main(int argc, char* argv[]) {
  int workgroupSize = 64;
  int numWorkgroups = 32;
  int deviceID = 0;
  bool enableValidationLayers = false;
  bool checkResults = false;

  int c;
  while ((c = getopt(argc, argv, "vct:w:d:")) != -1) {
    switch (c) {
      case 't':
        workgroupSize = atoi(optarg);
        break;
      case 'w':
        numWorkgroups = atoi(optarg);
        break;
      case 'v':
        enableValidationLayers = true;
        break;
      case 'c':
        checkResults = true;
        break;
      case 'd':
        deviceID = atoi(optarg);
        break;
      case '?':
        // optopt is an int; cast it so the option letter is printed rather
        // than its numeric character code.
        if (optopt == 't' || optopt == 'w' || optopt == 'd')
          std::cerr << "Option -" << static_cast<char>(optopt) << " requires an argument\n";
        else
          std::cerr << "Unknown option -" << static_cast<char>(optopt) << "\n";
        return 1;
      default:
        abort();
    }
  }

  // atoi yields 0 for non-numeric text, and negative values are accepted by
  // the parser; neither makes sense as a dispatch dimension.
  if (workgroupSize <= 0 || numWorkgroups <= 0) {
    std::cerr << "Workgroup size (-t) and number of workgroups (-w) must be positive\n";
    return 1;
  }

  auto size = numWorkgroups * workgroupSize * BATCH_SIZE;

  // Initialize instance.
  auto instance = easyvk::Instance(enableValidationLayers);
  // Get list of available physical devices.
  auto physicalDevices = instance.physicalDevices();
  // Reject an out-of-range device index up front instead of letting at() throw.
  if (deviceID < 0 || static_cast<std::size_t>(deviceID) >= physicalDevices.size()) {
    std::cerr << "Invalid device index " << deviceID << " (" << physicalDevices.size()
              << " device(s) available)\n";
    instance.teardown();
    return 1;
  }
  // Create device from the physical device selected with -d.
  auto device = easyvk::Device(instance, physicalDevices.at(deviceID));
  std::cout << "Using device: " << device.properties.deviceName << "\n";
  std::cout << "Device subgroup size: " << device.subgroupSize() << "\n";

  // Define the buffers to use in the kernel.
  auto in = easyvk::Buffer(device, size, sizeof(uint32_t));
  auto out = easyvk::Buffer(device, size, sizeof(uint32_t));
  auto prefixStates = easyvk::Buffer(device, numWorkgroups, 3*sizeof(uint32_t));
  auto partitionCtr = easyvk::Buffer(device, 1, sizeof(uint32_t));
  auto debug = easyvk::Buffer(device, 1, sizeof(uint32_t));

  // Write initial values to the buffers.
  for (int i = 0; i < size; i++) {
    // The buffer provides an untyped view of the memory, so you must specify
    // the type when using the load/store method.
    in.store<uint32_t>(i, i);
  }
  out.clear();
  prefixStates.clear();
  partitionCtr.clear();
  // The debug word is printed after the run, so clear it like the other
  // buffers rather than reporting whatever the allocation happened to contain.
  debug.clear();

  // The order here is the order in which the buffers are handed to the program.
  std::vector<easyvk::Buffer> bufs = {in, out, prefixStates, partitionCtr, debug};
  std::vector<uint32_t> spvCode =
  #include "build/prefix-scan.cinit"
  ;
  auto program = easyvk::Program(device, spvCode, bufs);
  program.setWorkgroups(numWorkgroups);
  program.setWorkgroupSize(workgroupSize);
  program.setWorkgroupMemoryLength(workgroupSize*sizeof(uint32_t), 0);

  // Run the kernel.
  program.initialize("prefix_scan");
  float time = program.runWithDispatchTiming();
  std::cout << "debug: " << debug.load<uint32_t>(0) << "\n";

  // Check the output. An explicit comparison is used instead of assert() so
  // the check still happens when the program is built with NDEBUG.
  bool resultsOk = true;
  if (checkResults) {
    uint32_t ref = 0;
    for (int i = 0; i < size; i++) {
      ref += static_cast<uint32_t>(i);
      const uint32_t got = out.load<uint32_t>(i);
      std::cout << "out[" << i << "]: " << got << ", ref:" << ref << "\n";
      if (got != ref) {
        std::cerr << "Mismatch at index " << i << ": got " << got
                  << ", expected " << ref << "\n";
        resultsOk = false;
        break;
      }
    }
  }

  // The time is treated as nanoseconds (as the original comment states), so
  // bytes / time is already bytes-per-nanosecond, i.e. GB/s, with no further
  // scaling. Bytes counted: one uint32_t per element for `in` plus one for `out`.
  const long bytesMoved = static_cast<long>(size) * static_cast<long>(sizeof(uint32_t)) * 2;
  std::cout << "GPU Time: " << time / 1000000 << " ms\n";
  std::cout << "Throughput: " << static_cast<float>(bytesMoved) / time << " GBPS\n";

  // Cleanup.
  program.teardown();
  in.teardown();
  out.teardown();
  prefixStates.teardown();
  partitionCtr.teardown();
  debug.teardown();
  device.teardown();
  instance.teardown();
  return resultsOk ? 0 : 1;
}