diff --git a/nengo_ocl/clra_gemv.py b/nengo_ocl/clra_gemv.py index bc04707..bdf5a5c 100644 --- a/nengo_ocl/clra_gemv.py +++ b/nengo_ocl/clra_gemv.py @@ -1,4 +1,5 @@ from collections import defaultdict +import math import numpy as np import pyopencl as cl @@ -31,13 +32,13 @@ def flops_from_geometry(geometry, items): flops = 0 for ii in items: gi = geometry[ii] - for dotinfo in gi['dots']: + for dotinfo in gi.dots: # -- for every value of A, we # (1) mult with some x # (2) add to a resulting inner-product - flops += dotinfo['a_shape1'] * gi['y_len'] * 2 + flops += dotinfo.a_shape1 * gi.y_len * 2 # XXX Generously assuming alpha & beta in use - flops += gi['y_len'] * 3 + flops += gi.y_len * 3 return flops @@ -46,11 +47,11 @@ def bw_from_geometry(geometry, items): elemsize = 4 for ii in items: gi = geometry[ii] - for dotinfo in gi['dots']: + for dotinfo in gi.dots: # -- load A - n_bytes += elemsize * dotinfo['a_shape1'] * gi['y_len'] + n_bytes += elemsize * dotinfo.a_shape1 * gi.y_len # -- load X - n_bytes += elemsize * dotinfo['a_shape1'] + n_bytes += elemsize * dotinfo.a_shape1 # -- load alpha scalar, beta scalar # XXX: Account for a possible full vector read @@ -58,19 +59,32 @@ def bw_from_geometry(geometry, items): n_bytes += 2 * elemsize # -- load Y_in - n_bytes += elemsize * gi['y_len'] + n_bytes += elemsize * gi.y_len # -- write Y_out - n_bytes += elemsize * gi['y_len'] + n_bytes += elemsize * gi.y_len return n_bytes +class Dot(object): -class DotSignature(object): + def __init__(self, j, x_j, a_j, x_start, a_start, a_stride0, a_shape1): + self.j = j + self.x_j = x_j + self.a_j = a_j + self.x_start = x_start + self.a_start = a_start + self.a_stride0 = a_stride0 + self.a_shape1 = a_shape1 # Columns of A - def __init__(self, dct): - self.y_len = dct['y_len'] - self.Ax_dims = tuple( - [(d['a_shape1'], d['a_stride0']) for d in dct['dots']]) +class GeometryEntry(object): + + def __init__(self, y_len, y_start, y_in_start, dots): + self.y_len = y_len # = Rows of A + self.y_start = y_start + self.y_in_start = y_in_start + assert all(isinstance(dot, Dot) for dot in dots) + self.dots = dots + self.Ax_dims = [(d.a_shape1, d.a_stride0) for d in dots] def __eq__(self, other): return type(self) == type(other) \ @@ -86,9 +100,66 @@ def __str__(self): counts[dim_stride] += 1 return 'yd=%s <- %s' % ( self.y_len, - ', '.join(('(%s x d=%s,s=%s)' % (counts[(d, s)], d, s)) - for (d, s) in counts)) + ', '.join(('(%s x n=%s,stride=%s)' % (counts[(n, s)], n, s)) + for (n, s) in counts)) + + +class Geometry(object): + """The geometry of a gemv_prog.""" + + def __init__(self, A_starts, X_starts, Y_starts, Y_in_starts, A_stride0s, + A_shape1s, Y_shape0s, X_js=None, A_js=None): + self._dbbs = [] + for bb in range(len(Y_shape0s)): + dots = [] + if X_js: + x_js_i = X_js[bb] + A_js_i = A_js[bb] + assert len(x_js_i) == len(A_js_i) + for jj, (xj, aj) in enumerate(zip(x_js_i, A_js_i)): + assert xj.size == 1 and aj.size == 1 + xj, aj = xj[0], aj[0] # to ignore numpy DeprecationWarning + dots.append(Dot(jj, xj, aj, X_starts[xj], A_starts[aj], + A_stride0s[aj], A_shape1s[aj])) + self._dbbs.append(GeometryEntry(Y_shape0s[bb], + Y_starts[bb], + Y_in_starts[bb], + dots)) + + def __getitem__(self, key): + return self._dbbs[key] + + def __str__(self): + return self.summary() + + def summary(self, items=None): + if items is None: + gg = self._dbbs + else: + gg = [self._dbbs[i] for i in items] + outputs = len(gg) + dots = np.array([len(g.dots) for g in gg]) + shape0s = np.array([g.y_len for g in gg]) + shape1s = np.hstack([[d.a_shape1 for d in g.dots] for g in gg]) + return ("outputs: %d; dots: %0.1f [%d, %d]; " + "shape: %0.1f [%d, %d] x %0.1f [%d, %d]" + % (outputs, dots.mean(), dots.min(), dots.max(), + shape0s.mean(), shape0s.min(), shape0s.max(), + shape1s.mean(), shape1s.min(), shape1s.max())) + + def print_summary(self, items=None, full=False): + print('geometry_summary: tag=%s' % self.tag) + if items is None: + gg = self._dbbs + else: + gg = map(self._dbbs.__getitem__, items) + + counts = defaultdict(lambda: 0) + for dsi in gg: + counts[dsi] += 1 + for dsi, count in sorted(counts.iteritems(), key=lambda x: x[1]): + print(' %6s\t%s' % (count, dsi)) class gemv_prog(object): @@ -118,76 +189,16 @@ def __init__(self, queue, alpha, A, A_js, X, X_js, beta, Y, self.geometry = self._geometry() self.plans = self.choose_plans() - def geometry_summary(self, items=None): - if items is None: - gg = self.geometry - else: - gg = [self.geometry[i] for i in items] - - outputs = len(gg) - dots = np.array([len(g['dots']) for g in gg]) - shape0s = np.array([g['y_len'] for g in gg]) - shape1s = np.hstack([[d['a_shape1'] for d in g['dots']] for g in gg]) - return ("outputs: %d; dots: %0.1f [%d, %d]; " - "shape: %0.1f [%d, %d] x %0.1f [%d, %d]" - % (outputs, dots.mean(), dots.min(), dots.max(), - shape0s.mean(), shape0s.min(), shape0s.max(), - shape1s.mean(), shape1s.min(), shape1s.max())) - - def print_geometry_summary(self, items=None, full=False): - print('geometry_summary: tag=%s' % self.tag) - if items is None: - gg = self.geometry - else: - gg = map(self.geometry.__getitem__, items) + def _geometry(self): + return Geometry(self.A.starts, self.X.starts, self.Y.starts, + self.Y_in.starts, self.A.stride0s, self.A.shape1s, + self.Y.shape0s, self.X_js, self.A_js) - ds = map(DotSignature, gg) - counts = defaultdict(lambda: 0) - for dsi in ds: - counts[dsi] += 1 - for dsi in sorted(counts): - print(' %6s\t%s' % (counts[dsi], dsi)) + def cl_geometry_and_textconf(self, items, padding=4, stride=0): + assert isinstance(stride, int) and stride in [0,1] - def _geometry(self): - A_starts = self.A.starts - X_starts = self.X.starts - Y_starts = self.Y.starts - Y_in_starts = self.Y_in.starts - A_stride0s = self.A.stride0s - A_shape1s = self.A.shape1s - Y_shape0s = self.Y.shape0s - - rval = [] - for bb in range(len(Y_shape0s)): - dbb = { - 'y_len': Y_shape0s[bb], - 'dots': [], - 'y_start': Y_starts[bb], - 'y_in_start': Y_in_starts[bb], - } - if self.X_js: - x_js_i = self.X_js[bb] - A_js_i = self.A_js[bb] - assert len(x_js_i) == len(A_js_i) - for jj, (xj, aj) in enumerate(zip(x_js_i, A_js_i)): - assert xj.size == 1 and aj.size == 1 - xj, aj = xj[0], aj[0] # to ignore numpy DeprecationWarning - dbb['dots'].append({ - 'j': jj, - 'x_j': xj, - 'a_j': aj, - 'x_start': X_starts[xj], - 'a_start': A_starts[aj], - 'a_stride0': A_stride0s[aj], - 'a_shape1': A_shape1s[aj], - }) - rval.append(dbb) - - return rval - - def cl_geometry_and_textconf(self, items, padding=4): p = self - max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items) + max_n_dots = max(len(p.geometry[ii].dots) for ii in items) n_structure_vars = 4 * max_n_dots + 5 structure_vars_stride = int( padding * np.ceil(float(n_structure_vars) / padding)) @@ -197,7 +208,7 @@ def cl_geometry_and_textconf(self, items, padding=4): X_starts = p.X.starts Y_starts = p.Y.starts Y_in_starts = p.Y_in.starts - A_stride0s = p.A.stride0s + A_strides = [p.A.stride0s, p.A.stride1s][stride] A_shape1s = p.A.shape1s Y_shape0s = p.Y.shape0s @@ -210,7 +221,7 @@ def cl_geometry_and_textconf(self, items, padding=4): xi, ai = xi[0], ai[0] # to ignore numpy DeprecationWarning gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi] gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai] - gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai] + gstructure[bbi, 2 * max_n_dots + ii] = A_strides[ai] gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai] # -- offset of output and input buffers gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb] @@ -227,7 +238,7 @@ def cl_geometry_and_textconf(self, items, padding=4): 'structure_vars_stride': structure_vars_stride, 'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots, 'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots, - 'a_s0': 'lstructure[2 * %s + ii]' % max_n_dots, + 'a_s%d' % stride: 'lstructure[2 * %s + ii]' % max_n_dots, 'N_i': 'lstructure[3 * %s + ii]' % max_n_dots, 'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots, 'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots, @@ -260,10 +271,8 @@ def ref_impl(p, items): if 0: if len(items) < 10: print('Falling back on reference implementation') - p.print_geometry_summary(items, full=True) else: print('Falling back on reference implementation') - p.print_geometry_summary(items) assert all(s == 1 for s in p.A.stride1s) assert all(s == 1 for s in p.X.stride1s) @@ -372,7 +381,7 @@ def ref_impl(p, items): Template(text, output_encoding='ascii').render(**p.__dict__)) gsize = ( - max(p.geometry[ii]['y_len'] for ii in items), + max(p.geometry[ii].y_len for ii in items), len(items)) lsize = None fn = cl.Program(p.queue.context, text).build().gemv_ref @@ -442,35 +451,34 @@ def reduce_impl(p, items, raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() - if not all(s == 1 for s in p.A.stride1s): - raise NotImplementedError() + if not all(s == 1 for s in p.A.stride0s): + raise NotImplementedError('A must be in column major') assert p.float_alpha is not None assert p.float_gamma is not None - cl_gstructure, textconf = p.cl_geometry_and_textconf(items) - max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items]) - max_reduce_len = max(max([gg['a_shape1'] - for gg in p.geometry[ii]['dots']]) + cl_gstructure, textconf = p.cl_geometry_and_textconf(items, stride=1) + max_n_dots = max([len(p.geometry[ii].dots) for ii in items]) + max_reduce_len = max(max([gg.a_shape1 + for gg in p.geometry[ii].dots]) for ii in items) - max_y_len = max([p.geometry[ii]['y_len'] for ii in items]) + max_y_len = max([p.geometry[ii].y_len for ii in items]) # segment means the piece of Y written by a work-group # group_size is the number of values that we're reducing over - if len(items) < 4: - if group_size is None: - group_size = 32 # XXX - if segment_size is None: - segment_size = min(max_y_len, 2) # XXX - else: - if group_size is None: - group_size = 32 # XXX - if segment_size is None: - segment_size = min(max_y_len, 4) # XXX + # TODO autotune + if group_size is None: + group_size = 4 + + if segment_size is None: + if len(items) < 8: + segment_size = min(max_y_len, 8) # XXX + else: + segment_size = min(max_y_len, 32) # XXX g_segments = int(np.ceil(float(max_y_len) / segment_size)) - gsize = (group_size, g_segments * segment_size, len(items)) - lsize = (group_size, segment_size, 1) + gsize = (g_segments * segment_size, group_size, len(items)) + lsize = (segment_size, group_size, 1) max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size)) textconf.update({ @@ -481,8 +489,8 @@ def reduce_impl(p, items, 'group_size': group_size, 'local_count': group_size * segment_size, 'max_reduce_len': max_reduce_len, - 'N_cutoff': max_reduce_iters * group_size, 'max_n_dots': max_n_dots, + 'log2_group_size': int(np.ceil(np.log2(group_size))), }) if 0: for k, v in textconf.items(): @@ -501,19 +509,18 @@ def reduce_impl(p, items, const __global ${Y_in.cl_buf.ctype} *Y_in_data, __global ${Y.cl_buf.ctype} *Y_data) { + const int i = get_local_id(0); + const int j = get_local_id(1); + const int m = get_global_id(0); + __local int lstructure[${n_structure_vars}]; - % if segment_size > 1: - // we'll cache X in shared memory so we load it only once - // for the whole segment - __local ${X.cl_buf.ctype} lX[${group_size}]; - % endif + //Scratch space for the dot products __local ${Y.cl_buf.ctype} - partialDotProduct[${segment_size}][${group_size}]; + sums[${group_size}][${segment_size}]; __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}]; - const int local_idx = get_local_id(0) - + get_local_id(1) * get_local_size(0); + const int local_idx = i + j * get_local_size(0); // load structure % if local_count < n_structure_vars: @@ -524,7 +531,7 @@ def reduce_impl(p, items, lstructure[ii] = gstructure[ get_global_id(2) * ${structure_vars_stride} + ii]; } - % else : + % else: if (local_idx < ${n_structure_vars}) { lstructure[local_idx] = gstructure[ @@ -533,25 +540,25 @@ def reduce_impl(p, items, % endif barrier(CLK_LOCAL_MEM_FENCE); - if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) + if (j == 0 && m < ${y_len}) { % if float_beta is not None and float_beta != 0 : - y_sum_pre[get_local_id(1)] = ${float_beta} - * Y_in_data[${y_in_starts} + get_global_id(1)]; + y_sum_pre[i] = ${float_beta} + * Y_in_data[${y_in_starts} + m]; % elif cl_beta is not None: - y_sum_pre[get_local_id(1)] = betas[${bb}] - * Y_in_data[${y_in_starts} + get_global_id(1)]; + y_sum_pre[i] = betas[${bb}] + * Y_in_data[${y_in_starts} + m]; % else : - y_sum_pre[get_local_id(1)] = 0; + y_sum_pre[i] = 0; % endif % if float_gamma is not None and float_gamma != 0: - y_sum_pre[get_local_id(1)] += ${float_gamma}; + y_sum_pre[i] += ${float_gamma}; % endif - // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]); + // printf("betaY + gamma=%f\\n", y_sum_pre[i]); } - partialDotProduct[get_local_id(1)][get_local_id(0)] = 0; + sums[j][i] = 0; % if max_n_dots > 1: for (int ii = 0; ii < ${n_dot_products}; @@ -562,57 +569,48 @@ def reduce_impl(p, items, % endif - for (int nn = get_local_id(0); - nn < ${N_cutoff}; - nn += get_local_size(0)) - { - // segment_size = ${segment_size} - % if (segment_size == 1): - if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) - { - partialDotProduct[get_local_id(1)][get_local_id(0)] += - A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] - * X_data[${x_starts} + nn]; - } - % else: - barrier(CLK_LOCAL_MEM_FENCE); - if ((get_local_id(1) == 0) && (nn < ${N_i})) - { - lX[get_local_id(0)] = X_data[${x_starts} + nn]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) + + __global ${A.cl_buf.ctype}* a = A_data + ${a_starts} + m; + + % if segment_size >= 4: + // we'll cache X in shared memory so we load it only once + // for the whole segment + __local ${X.cl_buf.ctype} lX[${max_reduce_len}]; + for (int k = local_idx; k < ${N_i}; k += ${local_count}) + lX[k] = X_data[${x_starts} + k]; + barrier(CLK_LOCAL_MEM_FENCE); + % endif + + if (m < ${y_len}) { + for (int k = get_global_id(1); k < ${N_i}; k += get_global_size(1)) { - partialDotProduct[get_local_id(1)][get_local_id(0)] += - A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] - * lX[get_local_id(0)]; + // segment_size = ${segment_size} + % if segment_size < 4: + sums[j][i] += a[${a_s1} * k] * X_data[${x_starts} + k]; + % else: + sums[j][i] += a[${a_s1} * k] * lX[k]; + % endif } - % endif } % if (max_n_dots > 1): } % endif - // -- Parallel reduction long work-group dimension 0 - for (uint stride = 1; - stride < get_local_size(0); - stride *= 2) - { + // -- Parallel reduction + % for ks in range(log2_group_size - 1, -1, -1): barrier(CLK_LOCAL_MEM_FENCE); - - uint index = 2 * stride * get_local_id(0); - if (index + stride < get_local_size(0)) - { - partialDotProduct[get_local_id(1)][index] += - partialDotProduct[get_local_id(1)][index + stride]; + if (j < ${2**ks}) { + sums[j][i] += sums[j + ${2**ks}][i]; + % if ks == 0: + if (m < ${y_len}) { + Y_data[${y_offset} + m] = y_sum_pre[i] + + ${float_alpha} * sums[0][i]; + } + % endif } - } - // barrier(CLK_LOCAL_MEM_FENCE); - if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) { - Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)] - + ${float_alpha} * partialDotProduct[get_local_id(1)][0]; - } + % endfor + } """ @@ -640,7 +638,7 @@ def reduce_impl(p, items, flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args - rval.description = p.geometry_summary(items) + rval.description = p.geometry.summary(items) return rval @@ -671,7 +669,7 @@ def many_dots_impl(p, items): raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() - if not all(s == 1 for s in p.A.stride1s): + if not all(s == 1 for s in p.A.stride0s): raise NotImplementedError() assert p.float_alpha is not None @@ -681,12 +679,12 @@ def many_dots_impl(p, items): # -- easy probably, but not done raise NotImplementedError() A_js_shape0s = p.A_js.shape0s - cl_gstructure, textconf = p.cl_geometry_and_textconf(items) + cl_gstructure, textconf = p.cl_geometry_and_textconf(items, stride=1) # min_n_dots = min(A_js_shape0s) max_n_dots = max(A_js_shape0s) - max_y_len = max(p.geometry[ii]['y_len'] for ii in items) + max_y_len = max(p.geometry[ii].y_len for ii in items) MAX_SEGMENT_SIZE = 16 # tricky to tune? segment_size = min( @@ -713,13 +711,7 @@ def many_dots_impl(p, items): 'segment_idx': 'segment_idx', 'dot_block_idx': 'dot_block_idx', }) - if 0: - for k, v in textconf.items(): - print(k, v) textconf.update(p.__dict__) - # print('float_gamma', textconf['float_gamma']) - # print('cl_gamma', textconf['cl_gamma']) - # print('clra_gamma', textconf['clra_gamma']) text = """ __kernel void gemv_many_dots( @@ -778,11 +770,13 @@ def many_dots_impl(p, items): ii < ${n_dot_products}; ii += ${dot_block_size}) { - for (int nn = 0; nn < ${N_i}; nn += 1) + const __global ${A.cl_buf.ctype}* a = A_data + ${a_starts} + + get_global_id(0); + const __global ${X.cl_buf.ctype}* x = X_data + ${x_starts}; + for (int nn = 0; nn < ${N_i}; nn++) { - y_sum_post[dot_block_idx][segment_idx] - += A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn] - * X_data[${x_starts} + nn]; + y_sum_post[dot_block_idx][segment_idx] += + a[${a_s1} * nn] * x[nn]; } } } @@ -825,30 +819,30 @@ def many_dots_impl(p, items): flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args - rval.description = p.geometry_summary(items) + rval.description = p.geometry.summary(items) return rval def block_impl(p, items): if p.clra_alpha is not None: - raise NotImplementedError() + raise NotImplementedError('block_impl: clra_alpha not supported') if p.clra_gamma is not None: - raise NotImplementedError() + raise NotImplementedError('block_impl: clra_gamma not supported') if p.clra_beta is not None: - raise NotImplementedError() + raise NotImplementedError('block_impl: clra_beta not supported') if p.cl_alpha is not None: - raise NotImplementedError() + raise NotImplementedError('block_impl: cl_alpha not supported') if p.cl_beta is not None: - raise NotImplementedError() + raise NotImplementedError('block_impl: cl_beta not supported') if p.cl_gamma is not None: - raise NotImplementedError() - if not all(s == 1 for s in p.A.stride1s): - raise NotImplementedError() + raise NotImplementedError('block_impl: cl_gamma not supported') + if not all(s == 1 for s in p.A.stride0s): + raise NotImplementedError('block_impl: A must be in column-major') if p.A_js is None: # -- easy probably, but not done - raise NotImplementedError() + raise NotImplementedError('block_impl: A_js cannot be None') # --- blocking # We want to group the dot products into blocks, so that each workgroup @@ -857,7 +851,7 @@ def block_impl(p, items): # region of this buffer, then reduce across the buffer in a separate kernel # block_y = 8 - block_y = 32 + block_y = 128 # block_x = 32 block_x = 128 @@ -924,7 +918,7 @@ def block_impl(p, items): bw_reduce += shape0i*(len(Ybufind_reduce) + 1) * p.Y.dtype.itemsize # --- create structure - gstructure = np.column_stack([shape0s, shape1s, Astride0s, Astride1s, + gstructure = np.column_stack([shape0s, shape1s, Astride1s, Astarts, Xstride0s, Xstarts, Ybufstarts]) cl_gstructure = to_device(p.queue, gstructure.astype(np.int32)) @@ -936,7 +930,7 @@ def block_impl(p, items): lsize0_log2 = int(np.log2(lsize0)) assert 2**lsize0_log2 == lsize0 - lsize = (lsize0, block_y, 1) + lsize = (block_y, lsize0, 1) gsize = (lsize[0], lsize[1], gstructure.shape[0]) assert np.prod(lsize) >= block_x @@ -947,12 +941,11 @@ def block_impl(p, items): n_structure_vars=gstructure.shape[1], shape0='lstructure[0]', shape1='lstructure[1]', - Astride0='lstructure[2]', - Astride1='lstructure[3]', - Astart='lstructure[4]', - Xstride0='lstructure[5]', - Xstart='lstructure[6]', - Ybufstart='lstructure[7]', + Astride1='lstructure[2]', + Astart='lstructure[3]', + Xstride0='lstructure[4]', + Xstart='lstructure[5]', + Ybufstart='lstructure[6]', block_y=block_y, block_x=block_x, lsize0=lsize0, @@ -975,9 +968,9 @@ def block_impl(p, items): __global ${Ybuf.ctype} *Ybufdata ) { - const int j = get_global_id(0); - const int i = get_global_id(1); - const int n = get_global_id(2); + const int i = get_global_id(0); // Row index + const int j = get_global_id(1); // Column group index + const int item_i = get_global_id(2); // load structure __local int lstructure[${n_structure_vars}]; @@ -985,7 +978,7 @@ def block_impl(p, items): get_local_id(0) + get_local_id(1)*get_local_size(0); if (local_idx < ${n_structure_vars}) lstructure[local_idx] = gstructure[ - n * ${n_structure_vars} + local_idx]; + item_i * ${n_structure_vars} + local_idx]; barrier(CLK_LOCAL_MEM_FENCE); __global const ${X.ctype} *x = Xdata + ${Xstart}; @@ -997,25 +990,33 @@ def block_impl(p, items): xlocal[local_idx] = x[local_idx*${Xstride0}]; barrier(CLK_LOCAL_MEM_FENCE); - __local ${Ybuf.ctype} sums[${block_y}][${lsize0}]; - sums[i][j] = 0; + __local ${Ybuf.ctype} sums[${lsize0}][${block_y}]; + sums[j][i] = 0; + + const int ncols = max(1, (int)(${shape1} / get_local_size(1))); + const int jj_max = j + 1 < get_local_size(1) ? + min((j+1) * ncols, ${shape1}) : ${shape1}; + + __global const ${A.ctype} *Ai = Adata + ${Astart} + i; if (i < ${shape0}) { - __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0}; - for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) { - sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj]; + for(int jj = j * ncols; jj < jj_max; jj++) { + sums[j][i] += Ai[jj*${Astride1}] * xlocal[jj]; } } barrier(CLK_LOCAL_MEM_FENCE); - % for k in range(lsize0_log2 - 1, 0, -1): - if (j < ${2**k}) - sums[i][j] += sums[i][${2**k} + j]; + % for k in range(lsize0_log2 - 1, -1, -1): barrier(CLK_LOCAL_MEM_FENCE); + if (j < ${2**k}) { + sums[j][i] += sums[j + ${2**k}][i]; + % if k == 0: + if (i < ${shape0}) { + ybuf[i] = ${float_alpha} * sums[j][i]; + } + % endif + } % endfor - - if (i < ${shape0} && j == 0) - ybuf[i] = ${float_alpha} * (sums[i][0] + sums[i][1]); } """ @@ -1030,7 +1031,7 @@ def block_impl(p, items): flops_per_call=flops_from_geometry(p.geometry, items), ) plan.full_args = full_args # prevent GC the args - plan.description = p.geometry_summary(items) + plan.description = p.geometry.summary(items) plan.Ybuf = clYbuf # --- Reduce kernel @@ -1121,11 +1122,171 @@ def block_impl(p, items): name='clra_gemv.block_impl_reduce', tag=p.tag) plan_reduce.full_args = full_args_reduce # prevent GC of the args plan_reduce.bw_per_call = bw_reduce - # plan_reduce.description = p.geometry_summary(items) + # plan_reduce.description = p.geometry.summary(items) return [plan, plan_reduce] +def one_thread_per_row_impl(p, items): + """Each thread computes the dot product over one row. + The rows are grouped consecutively when y_len is small. + """ + if p.clra_alpha is not None: + raise NotImplementedError() + if p.clra_gamma is not None: + raise NotImplementedError() + if p.clra_beta is not None: + raise NotImplementedError() + if p.cl_alpha is not None: + raise NotImplementedError() + if p.cl_gamma is not None: + raise NotImplementedError() + if not all(s == 1 for s in p.A.stride0s): + raise NotImplementedError('A must be in column major') + + num_items = len(items) + assert num_items > 0 + + assert p.float_alpha is not None + assert p.float_gamma is not None + + if p.A_js is None: + # -- easy probably, but not done + raise NotImplementedError() + + cl_gstructure, textconf = p.cl_geometry_and_textconf(items, + stride=1) + + max_y_len = max(p.geometry[ii].y_len for ii in items) + max_n_dots = max(len(p.geometry[ii].dots) for ii in items) + all_same_y_len = len(set(p.geometry[ii].y_len for ii in items)) == 1 + + + # Compute in consecutive threads if max_y_len is small, + # and all y_len are the same. + # TODO autotune + if all_same_y_len and max_y_len < 64: + consecutive = True + gsize = (max_y_len * num_items,) + else: + consecutive = False + gsize = (max_y_len, num_items) + + textconf.update({ + 'max_n_dots': max_n_dots, + 'max_y_len': max_y_len, + 'consecutive': consecutive, + 'all_same_y_len': all_same_y_len, + }) + textconf.update(p.__dict__) + + """ + int i, item_i; +% for ix, sprev, s, item_i in zip(range(len(items)), num_rows, num_rows[1:], items): + % if ix == 0: + if (global_i < ${s}) + % else: + else if (global_i < ${s}) + % endif + { + item_i = ${item_i}; + i = global_i - ${sprev}; + } + % endfor + """ + + text = """ + __kernel void gemv( + const __global int* gstructure, + const __global ${A.cl_buf.ctype}* A_data, + const __global ${X.cl_buf.ctype}* X_data, + % if cl_beta is not None: + const __global ${cl_beta.ctype}* betas, + % endif + const __global ${Y_in.cl_buf.ctype}* Y_in_data, + __global ${Y.cl_buf.ctype}* Y_data) + { + % if consecutive: + const int global_i = get_global_id(0); + + const int item_i = global_i / ${max_y_len}; // Item index + const int i = global_i - item_i * ${max_y_len}; // Row index within item + % else: + const int item_i = get_global_id(1); // Item index + const int i = get_global_id(0); // Row index within item + % endif + + const __global int* lstructure = + gstructure + item_i * ${structure_vars_stride}; + + % if not all_same_y_len: + if (i >= ${y_len}) return; + % endif + + ${Y.cl_buf.ctype} sum = 0; + + % if max_n_dots > 1: + for (int ii = 0; ii < ${n_dot_products}; ii++) + { + % else: + const int ii = 0; + % endif + + const int a_s1 = ${a_s1}; + const int n_i = ${N_i}; + + const __global ${A.cl_buf.ctype}* a = A_data + ${a_starts} + i; + const __global ${X.cl_buf.ctype}* x = X_data + ${x_starts}; + + for (int j=0;j 1: + } + % endif + + % if float_gamma is not None: + const ${Y.cl_buf.ctype} gamma = ${float_gamma}; + % else: + const ${Y.cl_buf.ctype} gamma = 0; + % endif + + % if float_beta is not None and float_beta != 0 : + Y_data[${y_offset} + i] = ${float_alpha} * sum + ${float_beta} * Y_in_data[${y_in_starts} + i] + gamma; + % elif cl_beta is not None: + Y_data[${y_offset} + i] = ${float_alpha} * sum + betas[${bb}] * Y_in_data[${y_in_starts} + i] + gamma; + % else: + Y_data[${y_offset} + i] = ${float_alpha} * sum + gamma; + % endif + } + """ + + text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) + + fn = cl.Program(p.queue.context, text).build().gemv + + full_args = [ + cl_gstructure, + p.A.cl_buf, + p.X.cl_buf, + ] + ([p.cl_beta] if p.cl_beta is not None else []) + [ + p.Y_in.cl_buf, + p.Y.cl_buf, + ] + + fn.set_args(*[arr.data for arr in full_args]) + rval = Plan(p.queue, fn, gsize, None, + name='clra_gemv.one_thread_per_row_impl', + tag=p.tag, + bw_per_call=bw_from_geometry(p.geometry, items), + flops_per_call=flops_from_geometry(p.geometry, items), + ) + rval.full_args = full_args # prevent GC the args + rval.description = p.geometry.summary(items) + return rval + + + class plan_ref_gemv(gemv_prog): def choose_plans(self): return [ref_impl(self, range(len(self.Y)))] @@ -1156,9 +1317,9 @@ def choose_plans(self): long_dots = [ ii for ii in remaining_items - if len(self.geometry[ii]['dots']) <= 2 - and max([0] + [dct['a_shape1'] - for dct in self.geometry[ii]['dots']]) > 16] + if len(self.geometry[ii].dots) <= 2 + and max([0] + [dct.a_shape1 + for dct in self.geometry[ii].dots]) > 16] if long_dots: try: long_plan = reduce_impl(self, long_dots) @@ -1172,7 +1333,7 @@ def choose_plans(self): # many_dots = [ii # for ii in remaining_items - # if len(self.geometry[ii]['dots']) > 3] + # if len(self.geometry[ii].dots) > 3] many_dots = remaining_items if many_dots: try: @@ -1191,3 +1352,88 @@ def choose_plans(self): plans.append(remaining_plan) return plans + +class plan_one_thread_per_row_gemv(gemv_prog): + def choose_plans(self): + return [one_thread_per_row_impl(self, range(len(self.Y)))] + +class plan_pretuned_gemv(gemv_prog): + PLANS = (one_thread_per_row_impl, reduce_impl, many_dots_impl, block_impl) + + # Tuning grid obtained on a NVIDIA GTX 970. + # The indices correspond to functions in PLANS above. + TUNING_GRID = np.array( + [ [[ 0, 0, 0, 0, 1, 2, 0] + , [ 0, 1, 1, 1, 1, 2, 0] + , [ 1, 1, 1, 1, 1, 0, 0] + , [ 0, 1, 1, 1, 1, 1, 0] + , [ 1, 1, 1, 1, 1, 1, 1] + , [ 3, 3, 3, 1, 1, 1, 1] + , [ 3, 3, 3, 3, 1, 1, 1]] + , + [[ 1, 0, 2, 0, 1, 0, 0] + , [ 2, 0, 2, 0, 0, 0, 0] + , [ 1, 1, 1, 0, 1, 0, 0] + , [ 1, 1, 1, 1, 1, 0, 1] + , [ 1, 1, 1, 2, 1, 0, 1] + , [ 3, 3, 3, 3, 1, 1, 1] + , [ 3, 3, 3, 3, 2, 0, 2]] + , + [[ 0, 0, 0, 1, 1, 0, 0] + , [ 2, 2, 2, 0, 2, 0, 0] + , [ 1, 0, 0, 2, 1, 0, 0] + , [ 1, 0, 1, 0, 0, 0, 1] + , [ 1, 1, 1, 1, 1, 0, 1] + , [ 3, 3, 3, 3, 1, 0, 1] + , [ 3, 3, 3, 3, 2, 0, 2]] + , + [[ 1, 0, 0, 0, 0, 0, 0] + , [ 1, 2, 0, 0, 0, 0, 0] + , [ 0, 0, 0, 1, 0, 0, 0] + , [ 2, 1, 0, 0, 0, 0, 0] + , [ 1, 1, 0, 0, 0, 0, 0] + , [ 3, 3, 3, 1, 0, 3, 0] + , [ 3, 3, 3, 3, 0, 3, 0]] + , + [[ 1, 2, 0, 0, 0, 0, 0] + , [ 2, 0, 0, 1, 0, 0, 0] + , [ 1, 0, 0, 0, 0, 0, 0] + , [ 2, 0, 1, 0, 0, 0, 0] + , [ 1, 1, 1, 0, 0, 0, 0] + , [ 3, 3, 1, 0, 0, 0, 0] + , [ 3, 3, 3, 0, 0, 0, 0]] + , + [[ 0, 0, 0, 0, 0, 0, 0] + , [ 2, 0, 0, 0, 0, 0, 0] + , [ 1, 0, 0, 0, 0, 0, 0] + , [ 1, 0, 0, 0, 0, 0, 0] + , [ 1, 0, 0, 0, 0, 0, 0] + , [ 3, 3, 0, 3, 0, 0, 0] + , [ 3, 3, 0, 0, 0, 0, 0]] + , + [[ 2, 0, 0, 0, 0, 0, 0] + , [ 1, 0, 0, 0, 0, 0, 0] + , [ 0, 0, 0, 0, 0, 0, 0] + , [ 0, 0, 0, 0, 0, 0, 0] + , [ 0, 0, 0, 3, 0, 0, 0] + , [ 3, 0, 0, 0, 0, 0, 0] + , [ 3, 0, 0, 0, 0, 0, 0]]] + , dtype='uint8') + LOG_BASE = 4 + + def choose_plans(self): + items = range(len(self.Y)) + m, n, k = 1, 1, 1 + for ii in items: + m = max(m, self.geometry[ii].y_len) + n = max(n, max(d.a_shape1 for d in self.geometry[ii].dots)) + k = max(k, len(self.geometry[ii].dots)) + logm, logn, logk = [ + min(int(round(math.log(x, plan_pretuned_gemv.LOG_BASE))), + plan_pretuned_gemv.TUNING_GRID.shape[i] - 1) + for i,x in enumerate((m,n,k))] + + ind = plan_pretuned_gemv.TUNING_GRID[logm, logn, logk] + plan = plan_pretuned_gemv.PLANS[ind](self, range(len(self.Y))) + + return plan if isinstance(plan, list) else [plan] diff --git a/nengo_ocl/clra_nonlinearities.py b/nengo_ocl/clra_nonlinearities.py index 5fb0f25..311d518 100644 --- a/nengo_ocl/clra_nonlinearities.py +++ b/nengo_ocl/clra_nonlinearities.py @@ -27,7 +27,7 @@ def blockify_matrices(max_size, ras): assert len(ra) == N assert (ra.shape1s == ra0.shape1s).all() assert (ra.shape0s == ra0.shape0s).all() - assert (ra.shape1s == ra.stride0s).all(), "not contiguous" + assert ((ra.shape1s == 1) | (ra.shape0s == ra.stride1s)).all(), "not contiguous" sizes = [] inds = [] @@ -123,8 +123,7 @@ def plan_timeupdate(queue, step, time, dt): def plan_reset(queue, Y, values, tag=None): assert len(Y) == len(values) - assert (Y.stride0s == Y.shape1s).all() - assert (Y.stride1s == 1).all() + assert (Y.stride0s == 1).all() assert Y.ctype == values.ctype text = """ @@ -193,6 +192,7 @@ def plan_copy(queue, A, B, incs, tag=None): for arr in [A, B]: assert (arr.shape1s == 1).all() + assert (arr.stride0s == 1).all() assert (A.shape0s == B.shape0s).all() assert A.ctype == B.ctype @@ -203,10 +203,8 @@ def plan_copy(queue, A, B, incs, tag=None): % if inc is None: __global const int *incdata, % endif - __global const int *Astrides, __global const int *Astarts, __global const ${Atype} *Adata, - __global const int *Bstrides, __global const int *Bstarts, __global ${Btype} *Bdata, __global const int *sizes @@ -221,12 +219,12 @@ def plan_copy(queue, A, B, incs, tag=None): __global ${Btype} *b = Bdata + Bstarts[n]; % if inc is True: - b[i*Bstrides[n]] += a[i*Astrides[n]]; + b[i] += a[i]; % elif inc is False: - b[i*Bstrides[n]] = a[i*Astrides[n]]; + b[i] = a[i]; % else: - if (incdata[n]) b[i*Bstrides[n]] += a[i*Astrides[n]]; - else b[i*Bstrides[n]] = a[i*Astrides[n]]; + if (incdata[n]) b[i] += a[i]; + else b[i] = a[i]; % endif } """ @@ -246,10 +244,8 @@ def plan_copy(queue, A, B, incs, tag=None): textconf = dict(Atype=A.ctype, Btype=B.ctype, N=N, inc=None) full_args = [ - to_device(queue, A.stride0s[inds]), to_device(queue, Astarts), A.cl_buf, - to_device(queue, B.stride0s[inds]), to_device(queue, Bstarts), B.cl_buf, to_device(queue, sizes), @@ -278,10 +274,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None): N = len(A) assert len(A) == len(B) == len(Ainds) == len(Binds) - for arr in [A, B, Ainds, Binds]: + for arr in A, B, Ainds, Binds: assert (arr.shape1s == 1).all() - assert (arr.stride1s == 1).all() - for arr in [Ainds, Binds]: assert (arr.stride0s == 1).all() assert (Ainds.shape0s == Binds.shape0s).all() @@ -294,10 +288,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None): % if inc is None: __global const int *incdata, % endif - __global const int *Astride0s, __global const int *Astarts, __global const ${Atype} *Adata, - __global const int *Bstride0s, __global const int *Bstarts, __global ${Btype} *Bdata, __global const int *Isizes, @@ -316,18 +308,17 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None): __global ${Btype} *b = Bdata + Bstarts[n]; __global const int *aind = AIdata + AIstarts[n]; __global const int *bind = BIdata + BIstarts[n]; - const int Astride0 = Astride0s[n], Bstride0 = Bstride0s[n]; if (i < Isizes[n]) { % if inc is True: - b[bind[i]*Bstride0] += a[aind[i]*Astride0]; + b[bind[i]] += a[aind[i]]; % elif inc is False: - b[bind[i]*Bstride0] = a[aind[i]*Astride0]; + b[bind[i]] = a[aind[i]]; % else: if (incdata[n]) - b[bind[i]*Bstride0] += a[aind[i]*Astride0]; + b[bind[i]] += a[aind[i]]; else - b[bind[i]*Bstride0] = a[aind[i]*Astride0]; + b[bind[i]] = a[aind[i]]; % endif } } @@ -346,10 +337,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None): textconf = dict(Atype=A.ctype, Btype=B.ctype, N=N, inc=None) full_args = [ - to_device(queue, A.stride0s[inds]), to_device(queue, A.starts[inds]), A.cl_buf, - to_device(queue, B.stride0s[inds]), to_device(queue, B.starts[inds]), B.cl_buf, to_device(queue, sizes), @@ -385,14 +374,11 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None): assert len(Y) == N and len(A) == N for arr in [A, X, Y]: - assert (arr.stride1s == 1).all() + assert (arr.stride0s == 1).all() assert ((X.shape0s == 1) | (X.shape0s == Y.shape0s)).all() assert ((X.shape1s == 1) | (X.shape1s == Y.shape1s)).all() assert ((A.shape0s == 1) | (A.shape0s == Y.shape0s)).all() assert ((A.shape1s == 1) | (A.shape1s == Y.shape1s)).all() - assert (X.stride1s == 1).all() - assert (Y.stride1s == 1).all() - assert (A.stride1s == 1).all() assert X.ctype == Y.ctype assert A.ctype == Y.ctype @@ -400,35 +386,35 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None): text = """ inline ${Ytype} get_element( __global const ${Ytype} *data, - const int shape0, const int shape1, const int stride0, + const int shape0, const int shape1, const int stride1, const int i, const int j ) { if (shape0 == 1 && shape1 == 1) return data[0]; else if (shape0 == 1) - return data[j]; + return data[j * stride1]; else if (shape1 == 1) - return data[i * stride0]; + return data[i]; else - return data[i * stride0 + j]; + return data[i + j * stride1]; } ////////// MAIN FUNCTION ////////// __kernel void elementwise_inc( __global const int *Ashape0s, __global const int *Ashape1s, - __global const int *Astride0s, + __global const int *Astride1s, __global const int *Astarts, __global const ${Atype} *Adata, __global const int *Xshape0s, __global const int *Xshape1s, - __global const int *Xstride0s, + __global const int *Xstride1s, __global const int *Xstarts, __global const ${Xtype} *Xdata, __global const int *Yshape0s, __global const int *Yshape1s, - __global const int *Ystride0s, + __global const int *Ystride1s, __global const int *Ystarts, __global ${Ytype} *Ydata ) @@ -440,24 +426,24 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None): const int Ashape0 = Ashape0s[n]; const int Ashape1 = Ashape1s[n]; - const int Astride0 = Astride0s[n]; + const int Astride1 = Astride1s[n]; const int Xshape0 = Xshape0s[n]; const int Xshape1 = Xshape1s[n]; - const int Xstride0 = Xstride0s[n]; - const int Yshape1 = Yshape1s[n]; - const int Ystride0 = Ystride0s[n]; - const int Ysize = Yshape0s[n] * Yshape1; + const int Xstride1 = Xstride1s[n]; + const int Yshape0 = Yshape0s[n]; + const int Ystride1 = Ystride1s[n]; + const int Ysize = Yshape1s[n] * Yshape0; for (int ij = get_global_id(0); ij < Ysize; ij += get_global_size(0)) { - int i = ij / Yshape1; - int j = ij % Yshape1; + const int i = ij % Yshape0; + const int j = ij / Yshape0; - ${Atype} aa = get_element(a, Ashape0, Ashape1, Astride0, i, j); - ${Xtype} xx = get_element(x, Xshape0, Xshape1, Xstride0, i, j); - y[i * Ystride0 + j] += aa * xx; + const ${Atype} aa = get_element(a, Ashape0, Ashape1, Astride1, i, j); + const ${Xtype} xx = get_element(x, Xshape0, Xshape1, Xstride1, i, j); + y[i + j * Ystride1] += aa * xx; } } """ @@ -468,17 +454,17 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None): full_args = ( A.cl_shape0s, A.cl_shape1s, - A.cl_stride0s, + A.cl_stride1s, A.cl_starts, A.cl_buf, X.cl_shape0s, X.cl_shape1s, - X.cl_stride0s, + X.cl_stride1s, X.cl_starts, X.cl_buf, Y.cl_shape0s, Y.cl_shape1s, - Y.cl_stride0s, + Y.cl_stride1s, Y.cl_starts, Y.cl_buf, ) @@ -510,8 +496,7 @@ def plan_linearfilter(queue, X, Y, A, B, Xbuf, Ybuf, tag=None): assert len(Y) == N and len(A) == N and len(B) == N for arr in [X, Y, A, B, Xbuf, Ybuf]: - assert (arr.shape1s == arr.stride0s).all() - assert (arr.stride1s == 1).all() + assert (arr.stride0s == 1).all() for arr in [X, Y, A, B]: # vectors assert (arr.shape1s == 1).all() assert (X.shape0s == Y.shape0s).all() @@ -672,14 +657,11 @@ def plan_probes(queue, periods, X, Y, tag=None): N = len(X) # N.B. X[i].shape = (M, N) - # Y[i].shape = (buf_len, M * N) + # Y[i].shape = (M * N, buflen) for arr in [X, Y]: - assert (arr.stride1s == 1).all() - assert (X.shape0s * X.shape1s == Y.shape1s).all() - assert (X.stride0s == X.shape1s).all() - assert (X.stride1s == 1).all() - assert (Y.stride0s == Y.shape1s).all() - assert (Y.stride1s == 1).all() + assert (arr.stride0s == 1).all() + assert (X.shape0s * X.shape1s == Y.shape0s).all() + assert (Y.stride1s == Y.shape0s).all() periods = np.asarray(periods, dtype='float32') cl_periods = to_device(queue, periods) @@ -782,7 +764,7 @@ def plan_direct(queue, code, init, input_names, inputs, output, tag=None): for x in inputs: assert len(x) == len(output) for x in inputs + [output]: - assert (x.shape1s == 1).all() and (x.stride1s == 1).all() + assert (x.shape1s == 1).all() and (x.stride1s == x.shape0s).all() assert (x.stride0s == 1).all() input_types = [x.ctype for x in inputs] @@ -843,7 +825,7 @@ def plan_direct(queue, code, init, input_names, inputs, output, tag=None): def plan_lif(queue, dt, J, V, W, outS, ref, tau, N=None, tau_n=None, - inc_n=None, upsample=1, **kwargs): + inc_n=None, upsample=1, fastlif=False, **kwargs): adaptive = N is not None assert J.ctype == 'float' for array in [V, W, outS]: @@ -864,37 +846,62 @@ def plan_lif(queue, dt, J, V, W, outS, ref, tau, N=None, tau_n=None, dtu=dt/upsample, dtu_inv=upsample/dt, dt_inv=1/dt) decs = """ char spiked; - ${type} dV, overshoot; + ${type} dV; const ${type} dtu = ${dtu}, dtu_inv = ${dtu_inv}, dt_inv = ${dt_inv}; % if adaptive: const ${type} dt = ${dt}; % endif const ${type} V_threshold = 1; +%if fastlif: + const ${type} delta_t = dtu; +%else: + ${type} delta_t; +%endif """ # TODO: could precompute -expm1(-dtu / tau) text = """ spiked = 0; % for ii in range(upsample): + W -= dtu; +% if not fastlif: + if (W > 0) + delta_t = max((${type})0, dtu - W); + else + delta_t = dtu; +% endif % if adaptive: - dV = -expm1(-dtu / tau) * (J - N - V); + dV = -expm1(-delta_t / tau) * (J - N - V); % else: - dV = -expm1(-dtu / tau) * (J - V); + dV = -expm1(-delta_t / tau) * (J - V); % endif V += dV; - W -= dtu; +% if fastlif: if (V < 0 || W > dtu) V = 0; else if (W >= 0) V *= 1 - W * dtu_inv; +% endif if (V > V_threshold) { - overshoot = dtu * (V - V_threshold) / dV; +% if fastlif: + const ${type} overshoot = dtu * (V - V_threshold) / dV; W = ref - overshoot + dtu; +% else: + const ${type} t_spike = dtu + tau * log1p( + -(V - V_threshold) / (J - V_threshold)); + W = ref + t_spike; +% endif V = 0; spiked = 1; } +% if not fastlif: + else if (V < 0) { + V = 0; + } +% endif + % endfor outV = V; outW = W; @@ -997,8 +1004,8 @@ def _plan_template(queue, name, core_text, declares="", tag=None, assert vname not in avars, "Name clash" assert len(v) == len(input0) assert (v.shape0s == input0.shape0s).all() - assert (v.stride0s == v.shape1s).all() # rows contiguous - assert (v.stride1s == 1).all() # columns contiguous + assert (v.stride0s == 1).all() # rows contiguous + assert (v.stride1s == v.shape0s).all() # columns contiguous assert (v.shape1s == 1).all() # vectors only offset = '%(name)s_starts[gind1]' % {'name': vname} @@ -1231,7 +1238,7 @@ def plan_whitenoise(queue, Y, dist_enums, dist_params, scale, inc, dt, rngs, for arr in [Y, dist_enums, dist_params]: assert (arr.shape1s == 1).all() assert (arr.stride0s == 1).all() - assert (arr.stride1s == 1).all() + assert (arr.stride1s == arr.shape0s).all() assert (dist_enums.shape0s == 1).all() diff --git a/nengo_ocl/clraggedarray.py b/nengo_ocl/clraggedarray.py index 265be04..d7ab459 100644 --- a/nengo_ocl/clraggedarray.py +++ b/nengo_ocl/clraggedarray.py @@ -73,6 +73,7 @@ def __init__(self, queue, np_raggedarray): self.stride1s = np_raggedarray.stride1s self.buf = np_raggedarray.buf self.names = np_raggedarray.names + self.order = np_raggedarray.order @classmethod def from_arrays(cls, queue, arrays, names=None, dtype=None, align=False): @@ -248,9 +249,11 @@ def __setitem__(self, item, new_value): # contiguous clarray = self.getitem_device(item) if isinstance(new_value, np.ndarray): - array = np.asarray(new_value, order='C', dtype=self.dtype) + array = np.asarray(new_value, dtype=self.dtype, + order=self.order) else: - array = np.zeros(clarray.shape, dtype=clarray.dtype) + array = np.zeros(clarray.shape, dtype=clarray.dtype, + order=self.order) array[...] = new_value array.shape = clarray.shape # reshape to avoid warning diff --git a/nengo_ocl/raggedarray.py b/nengo_ocl/raggedarray.py index e06e1aa..161fa87 100644 --- a/nengo_ocl/raggedarray.py +++ b/nengo_ocl/raggedarray.py @@ -33,8 +33,10 @@ class RaggedArray(object): in the same underlying buffer. """ - def __init__(self, arrays, names=None, dtype=None, align=False): - arrays = [np.asarray(a) for a in arrays] + def __init__(self, arrays, names=None, dtype=None, align=False, order='F'): + assert order in ('C','F') + + arrays = [np.asarray(a, order=order) for a in arrays] assert len(arrays) > 0 assert all(a.ndim <= 2 for a in arrays) @@ -55,16 +57,22 @@ def __init__(self, arrays, names=None, dtype=None, align=False): self.starts = starts self.shape0s = [a.shape[0] if a.ndim > 0 else 1 for a in arrays] self.shape1s = [a.shape[1] if a.ndim > 1 else 1 for a in arrays] - self.stride0s = [a.shape[1] if a.ndim == 2 else 1 for a in arrays] - self.stride1s = [1 for a in arrays] + self.stride0s = [a.strides[0] // a.itemsize + if a.ndim > 0 else 1 for a in arrays] + self.stride1s = [a.strides[1] // a.itemsize + if a.ndim > 1 else + (max(a.shape[0], 1) if a.ndim > 0 and order == 'F' else 1) + for a in arrays] buf = np.zeros(starts[-1] + arrays[-1].size, dtype=dtype) for a, s in zip(arrays, starts): - buf[s:s+a.size] = a.ravel() + buf[s:s+a.size] = a.ravel(order=order) self.buf = buf self.names = names + self.order = order + self._sizes = None @classmethod diff --git a/nengo_ocl/simulator.py b/nengo_ocl/simulator.py index dd83935..49a99f6 100644 --- a/nengo_ocl/simulator.py +++ b/nengo_ocl/simulator.py @@ -24,7 +24,7 @@ from nengo_ocl.raggedarray import RaggedArray from nengo_ocl.clraggedarray import CLRaggedArray, to_device -from nengo_ocl.clra_gemv import plan_block_gemv +from nengo_ocl.clra_gemv import plan_pretuned_gemv, plan_one_thread_per_row_gemv from nengo_ocl.clra_nonlinearities import ( plan_timeupdate, plan_reset, plan_copy, plan_slicedcopy, plan_direct, plan_lif, plan_lif_rate, @@ -209,10 +209,11 @@ def get_views(self): class ViewBuilder(object): - def __init__(self, bases, rarray): + def __init__(self, bases, rarray, order='F'): self.sidx = {bb: ii for ii, bb in enumerate(bases)} assert len(bases) == len(self.sidx) self.rarray = rarray + self.order = order self.starts = [] self.shape0s = [] @@ -238,8 +239,12 @@ def append_view(self, obj): self.starts.append(self.rarray.starts[idx] + obj.elemoffset) self.shape0s.append(obj.shape[0] if obj.ndim > 0 else 1) self.shape1s.append(obj.shape[1] if obj.ndim > 1 else 1) - self.stride0s.append(obj.elemstrides[0] if obj.ndim > 0 else 1) - self.stride1s.append(obj.elemstrides[1] if obj.ndim > 1 else 1) + if self.order == 'F': + self.stride0s.append(1) + self.stride1s.append(obj.shape[0] if obj.ndim > 0 else 1) + else: + self.stride0s.append(obj.elemstrides[0] if obj.ndim > 0 else 1) + self.stride1s.append(obj.elemstrides[1] if obj.ndim > 1 else 1) self.names.append(getattr(obj, 'name', '')) self.sidx[obj] = len(self.sidx) @@ -466,7 +471,7 @@ def _sig_gemv(self, ops, A_js_fn, X_js_fn, Y_fn, Y_in_fn=None, if callable(beta): beta = RaggedArray([sidx[beta(o)] for o in ops], dtype=np.float32) - rval = plan_block_gemv( + rval = plan_one_thread_per_row_gemv( self.queue, alpha, all_data, A_js, all_data, X_js, beta, Y, Y_in=Y_in, gamma=gamma, tag=tag) return rval.plans @@ -850,7 +855,7 @@ def plan_probes(self): X = self.all_data[ [self.sidx[self.model.sig[p]['in']] for p in probes]] Y = self.RaggedArray( - [np.zeros((n_prealloc, self.model.sig[p]['in'].size)) + [np.zeros((self.model.sig[p]['in'].size, n_prealloc), order='F') for p in probes], dtype=np.float32) cl_plan = plan_probes(self.queue, periods, X, Y) @@ -875,8 +880,10 @@ def _probe(self): # XXX: this syntax retrieves *ALL* of Y from the device # because the :n_buffered only works on the ndarray # *after* it has been transferred. - raw = plan.Y[i][:n_buffered] - shaped = raw.reshape((n_buffered,) + shape) + raw = plan.Y[i][:,:n_buffered] + shaped = raw.reshape(shape + (n_buffered,), order='F') + # Transpose to (n_buffered, shape). + shaped = shaped.transpose([len(shape)] + list(range(len(shape)))) self._probe_outputs[probe].extend(shaped) plan.cl_bufpositions.fill(0) self.queue.finish() @@ -908,7 +915,7 @@ def run_steps(self, N, progress_bar=True): N -= B progress.step(n=B) - if self.profiling > 1: + if self.profiling: self.print_profiling() def reset(self, seed=None): @@ -1043,11 +1050,11 @@ def print_profiling(self, sort=None): print('%8s|%10s|%10s|%10s|' % ('n_calls', 'runtime', 'GF/s', 'GB/s')) for r in table: - print('%8d|%10.3f|%10.3f|%10.3f| %s' % r) + print('%8d|%10.5f|%10.3f|%10.3f| %s' % r) # totals totals print('-' * 80) - col = lambda c: np.asarray(map(lambda x: x[c], table)) + col = lambda c: np.asarray([x[c] for x in table]) times = col(1) def wmean(x): diff --git a/nengo_ocl/tests/test_clra_gemv.py b/nengo_ocl/tests/test_clra_gemv.py index e8c2ab2..20eb533 100644 --- a/nengo_ocl/tests/test_clra_gemv.py +++ b/nengo_ocl/tests/test_clra_gemv.py @@ -11,7 +11,7 @@ from nengo_ocl.clraggedarray import CLRaggedArray as CLRA from nengo_ocl.clra_gemv import ( plan_reduce_gemv, plan_many_dots_gemv, plan_block_gemv, - plan_ragged_gather_gemv) + plan_ragged_gather_gemv, plan_one_thread_per_row_gemv) ctx = cl.create_some_context() @@ -24,7 +24,7 @@ def pytest_generate_tests(metafunc): metafunc.parametrize( "planner", [ plan_reduce_gemv, plan_many_dots_gemv, plan_block_gemv, - plan_ragged_gather_gemv]) + plan_ragged_gather_gemv, plan_one_thread_per_row_gemv]) def allclose(raA, raB): @@ -285,20 +285,35 @@ def test_speed(rng): A_js = RA(ajs, dtype=np.int32) X_js = RA(xjs, dtype=np.int32) - # -- run cl computation - prog = plan_ragged_gather_gemv( - queue, alpha, clA, A_js, clX, X_js, beta, clY) - plans = prog.choose_plans() + # For bandwidth and load computations + total_mem_gb = (sum(m*n + m + n + m for m,n in zip(ms,ns)) + 3) * np.dtype('float32').itemsize * 1e-9 + total_gflops = sum(m* (2*n-1) + 4*m for m,n in zip(ms,ns)) * 1e-9 - print('') - print('-' * 5 + ' Plans ' + '-' * 45) - for plan in plans: - print(plan) + # -- run cl computation a few times + ntrials = 5 + total_time = 0.0 + for i in range(ntrials): + prog = plan_ragged_gather_gemv( + queue, alpha, clA, A_js, clX, X_js, beta, clY) + plans = prog.choose_plans() - with Timer() as timer: - for plan in plans: - plan() - print("nengo_ocl: %0.3f" % timer.duration) + if i == 0: + print('') + print('-' * 5 + ' Plans ' + '-' * 45) + for plan in plans: + print(plan) + + with Timer() as timer: + for plan in plans: + plan() + + total_time += timer.duration + + t = total_time / ntrials + print("nengo_ocl: %0.3f s, bandwidth: %.1f GB/s, load: %.1f GFlops" + % (t, + total_mem_gb / t, + total_gflops / t)) # -- speed test in ocl blas if pyopencl_blas: @@ -318,23 +333,31 @@ def array(a): # for _ in range(k)] queue.finish() - with Timer() as timer: - if 0: - # use a single queue - for A, X, Y in zip(clAs, clXs, clYs): - pyopencl_blas.gemv(queue, A, X, Y) - queue.finish() - else: - # use multiple parallel queues - events = [] - for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)): - q = queues[i % len(queues)] - e = pyopencl_blas.gemv(q, A, X, Y) - events.append(e) - for q in queues: - q.flush() - cl.wait_for_events(events) - print("clBLAS: %0.3f" % timer.duration) + + total_time = 0.0 + for i in range(ntrials): + with Timer() as timer: + if 0: + # use a single queue + for A, X, Y in zip(clAs, clXs, clYs): + pyopencl_blas.gemv(queue, A, X, Y) + queue.finish() + else: + # use multiple parallel queues + events = [] + for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)): + q = queues[i % len(queues)] + e = pyopencl_blas.gemv(q, A, X, Y) + events.append(e) + for q in queues: + q.flush() + cl.wait_for_events(events) + total_time += timer.duration + t = total_time / ntrials + print("clBLAS: %0.3f, bandwidth: %.1f GB/s, load: %.1f GFlops" + % (t, + total_mem_gb / t, + total_gflops / t)) if __name__ == '__main__':