-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathclkernel.m
More file actions
267 lines (252 loc) · 10.3 KB
/
clkernel.m
File metadata and controls
267 lines (252 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
% clkernel is a class that represents an OpenCL kernel object.
% It is used to create a function out of the CL kernels that were compiled
% and sent to the GPGPU device using the opencl.addfile and opencl.build
% functions.
%
% An example usage:
%
% ocl = opencl();
% ocl.initialize();
%
% ocl.addfile('cl/simple_add.cl');
% ocl.build();
%
% % Create some data objects:
%
% x = clobject(single(1:10));
% y = clobject(single(11:20));
% z = clobject(zeros(1,10, 'single'));
%
% % z = x+y:
% global_work_size = [10,0,0];
% local_work_size = [10,0,0];
%
% addkernel = clkernel('add', global_work_size, local_work_size);
% addkernel(x,y,z, uint32(10));
%
% % Fetch z values:
% values = z.get();
%
% See clkernel/clkernel
% clkernel/subsref
% clkernel/execute
%
% Author: Radford Ray Juang
%
% Copyright (C) 2011 by Radford Ray Juang
%
% Permission is hereby granted, free of charge, to any person obtaining a copy
% of this software and associated documentation files (the "Software"), to deal
% in the Software without restriction, including without limitation the rights
% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
% copies of the Software, and to permit persons to whom the Software is
% furnished to do so, subject to the following conditions:
%
% The above copyright notice and this permission notice shall be included in
% all copies or substantial portions of the Software.
%
% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
% THE SOFTWARE.
classdef clkernel < handle
    % clkernel wraps a compiled OpenCL kernel so that it can be invoked
    % like a regular MATLAB function: f(arg1, arg2, ...).

    properties
        % 1-based index of the device the kernel is queued on. It is
        % converted to a 0-based index when handed to openclcmd.
        device = 1
        % Kernel identifier returned by openclcmd('create_kernel', ...).
        id = [];
    end

    methods
        function self = clkernel(kernelname, global_dim, local_dim, target_device)
            % obj = clkernel(kernel_name)
            % obj = clkernel(kernel_name, global_work_size)
            % obj = clkernel(kernel_name, global_work_size, local_work_size)
            % obj = clkernel(kernel_name, global_work_size, local_work_size,
            %                target_device)
            %
            % Creates a kernel object that represents the compiled kernel
            % specified by kernel_name. This is the actual __kernel function
            % defined in the cl source files added using opencl.addfile and
            % opencl.build.
            %
            % kernel_name is a string containing the kernel function name in
            % the CL file. It is required; omitting it raises
            % 'clkernel:missingKernelName'.
            %
            % global_work_size is the number of global compute units you want
            % to use. This is a 1x3 array containing the number of compute
            % units in each direction. For example, if you have 256
            % processors, you can have the processors divide up the workload
            % into 4x4x16 blocks:
            %       global_work_size = [4,4,16]
            % Or, you can divide the data into 16x16 blocks:
            %       global_work_size = [16,16,0]
            % Or you can just divide the data into 256 blocks:
            %       global_work_size = [256,0,0]
            %
            % If this is unspecified or empty, the default is [128,0,0].
            %
            % local_work_size specifies the number of local work groups to
            % divide each global compute unit into. Think of this as like
            % threads on a processor. The global_work_size specifies the
            % number of processors to spread the division of labor over,
            % whereas the local_work_size specifies the number of threads to
            % divide the work for each processor. Threads can communicate and
            % share memory with one another whereas global compute blocks
            % cannot, and it is possible for multiple threads within each
            % global compute unit to execute at a time.
            %
            % Again, if this is unspecified or empty, the default is
            % [128,0,0].
            %
            % target_device is the index of the device to execute the kernel
            % on. If you initialized one device, you can safely ignore this
            % parameter. However, if you've initialized more than one device,
            % e.g.:
            %       ocl=opencl();
            %       ocl.initialize(1,[1,3,5]);
            %
            % then target_device=1 will execute the kernel on device 1
            %      target_device=2 will execute the kernel on device 3
            %      target_device=3 will execute the kernel on device 5
            %
            % If unspecified or empty, the first device index is used.
            % NOTE: Use of multiple target devices has not been tested.
            %
            % Once a kernel has been created with say:
            %   addkernel = clkernel('add', global_work_size, local_work_size);
            %
            % One can just execute the kernel by using the defined kernel as
            % a function. For example,
            %
            %   addkernel(buffA, buffB, buffC, uint32(10));
            %
            % Constants must be cast to the correct type that the kernel
            % requires. Non-constant variables must be clbuffer or clobject
            % instances.
            %
            % NOTE: kernel execution is non-blocking. So, the function will
            %       return regardless of whether kernel execution is
            %       completed.
            %
            if nargin < 1
                % Fail early with a clear message instead of an
                % undefined-variable error at the openclcmd call below.
                error('clkernel:missingKernelName', ...
                      'clkernel requires the name of a compiled kernel.');
            end

            % Automatically pick a size (this is a bad idea in general)
            if nargin < 2 || isempty(global_dim)
                global_dim = [128, 0, 0];
            end
            if nargin < 3 || isempty(local_dim)
                local_dim = [128, 0, 0];
            end
            if nargin < 4 || isempty(target_device)
                target_device = 1;
            end

            self.device = target_device;
            % Note the argument order expected here: local size first, then
            % global size, then the kernel name.
            self.id = openclcmd('create_kernel', uint32(local_dim), ...
                                uint32(global_dim), kernelname);
        end

        function value = subsref(self, S)
            % Overrides matlab ( ) functionality and passes the call to the
            % execute function. For example, if a kernel is created as
            % follows:
            %   f = clkernel(kernelname, global_dims, local_dims);
            %
            % Place the execution of the kernel on the device queue by :
            %   f(arg1, arg2, arg3);
            %
            % And to ensure the execution is complete, make sure you call
            % opencl.wait. Example:
            %
            %   ocl = opencl();
            %   ocl.initialize(1,1);
            %
            %   ...
            %
            %   ocl.wait();
            %
            % Dot references are handled as well:
            %   f.prop            returns the property value
            %   f.method(a, b)    calls the public method with (f, a, b)
            %   f.method          calls the public method with (f)
            % Methods that declare no output arguments (e.g. execute) are
            % called without requesting one and yield value = []. Any
            % subscripts left over after the first reference are applied to
            % the resulting value.
            %
            index = S(1);
            if strcmp(index.type, '.')
                % Function call or variable name?
                if ismethod(self, index.subs)
                    % Function call, with or without an argument list.
                    if numel(S) > 1 && strcmp(S(2).type, '()')
                        args = S(2).subs;
                        S(1:2) = [];
                    else
                        args = {};
                        S(1) = [];
                    end
                    if declares_no_outputs(self, index.subs)
                        % Requesting an output from a method that declares
                        % none raises "Too many output arguments".
                        feval(index.subs, self, args{:});
                        value = [];
                    else
                        value = feval(index.subs, self, args{:});
                    end
                else
                    % Variable
                    S(1) = [];
                    value = self.(index.subs);
                end
            else
                % Any non-dot reference on the object runs the kernel.
                self.execute(index.subs{:});
                value = [];
                S = S(2:end);
            end
            if ~isempty(S)
                value = subsref(value, S);
            end
        end

        function execute(self, varargin)
            % obj.execute(arg1, arg2, ...)
            %
            % Place the execution of the kernel on the device queue
            % with the provided arguments.
            % arg1, ... are arguments to the kernel. Constant arguments must
            % be cast to the correct variable type before being passed.
            %
            % Non-constant arguments must be of type clbuffer or clobject
            %
            % Raises an error ('Invalid type') for any argument that is
            % neither a clbuffer, a clobject, nor one of the plain value
            % classes listed below.
            %

            % Plain MATLAB classes that are passed to the kernel by value.
            % NOTE(review): int64 is not in this list although uint64 is;
            % kept as-is because openclcmd's supported types are not visible
            % here -- confirm before adding it.
            value_classes = {'double', 'single', 'char', ...
                             'uint8', 'uint16', 'uint32', 'uint64', ...
                             'int8', 'int16', 'int32', 'logical'};
            kernelid = self.id;

            for i = 1:numel(varargin)
                argnum = i - 1;          % kernel argument indices are 0-based
                argval = varargin{i};

                bufferid = -1;           % -1 means "no device buffer"
                data = [];
                nbytes = 0;

                if isa(argval, 'clbuffer')
                    % It can be a buffer with actual data or a buffer that
                    % is empty.
                    bufferid = argval.id;
                    if bufferid < 0
                        % Local variable type: only a size is passed.
                        nbytes = argval.num_bytes;
                    end
                elseif isa(argval, 'clobject')
                    bufferid = argval.buffer.id;
                    if bufferid < 0
                        % Local variable type: only a size is passed.
                        nbytes = argval.num_bytes;
                    end
                elseif any(strcmp(class(argval), value_classes))
                    data = argval;
                else
                    error('Invalid type');
                end

                openclcmd('set_kernel_args', kernelid, argnum, bufferid, ...
                          data, int32(nbytes));
                %fprintf(1, 'set_kernel_args: kernelid = %d, argnum = %d, buffer=%d, data=%g, sz=%d\n', ...
                %    kernelid, argnum, bufferid, data, nbytes);
            end % for i

            % The device index is stored 1-based and passed 0-based.
            openclcmd('execute_kernel', self.device-1, self.id);
        end
    end

    methods (Access = private)
        function tf = declares_no_outputs(self, methodname)
            % tf = declares_no_outputs(obj, methodname)
            %
            % Returns true when methodname is declared with an empty output
            % list by this class or a subclass. Methods inherited from
            % handle always report false, so they keep being called with
            % one requested output.
            tf = false;
            info = metaclass(self);
            candidates = info.MethodList;
            for k = 1:numel(candidates)
                m = candidates(k);
                if strcmp(m.Name, methodname) && ...
                        ~strcmp(m.DefiningClass.Name, 'handle')
                    tf = isempty(m.OutputNames);
                    return;
                end
            end
        end
    end
end