diff --git a/nengo_ocl/clra_gemv.py b/nengo_ocl/clra_gemv.py
index bc04707..bdf5a5c 100644
--- a/nengo_ocl/clra_gemv.py
+++ b/nengo_ocl/clra_gemv.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+import math
 
 import numpy as np
 import pyopencl as cl
@@ -31,13 +32,13 @@ def flops_from_geometry(geometry, items):
     flops = 0
     for ii in items:
         gi = geometry[ii]
-        for dotinfo in gi['dots']:
+        for dotinfo in gi.dots:
             # -- for every value of A, we
             #    (1) mult with some x
             #    (2) add to a resulting inner-product
-            flops += dotinfo['a_shape1'] * gi['y_len'] * 2
+            flops += dotinfo.a_shape1 * gi.y_len * 2
         # XXX Generously assuming alpha & beta in use
-        flops += gi['y_len'] * 3
+        flops += gi.y_len * 3
     return flops
 
 
@@ -46,11 +47,11 @@ def bw_from_geometry(geometry, items):
     elemsize = 4
     for ii in items:
         gi = geometry[ii]
-        for dotinfo in gi['dots']:
+        for dotinfo in gi.dots:
             # -- load A
-            n_bytes += elemsize * dotinfo['a_shape1'] * gi['y_len']
+            n_bytes += elemsize * dotinfo.a_shape1 * gi.y_len
             # -- load X
-            n_bytes += elemsize * dotinfo['a_shape1']
+            n_bytes += elemsize * dotinfo.a_shape1
 
         # -- load alpha scalar, beta scalar
         #    XXX: Account for a possible full vector read
@@ -58,19 +59,32 @@ def bw_from_geometry(geometry, items):
         n_bytes += 2 * elemsize
 
         # -- load Y_in
-        n_bytes += elemsize * gi['y_len']
+        n_bytes += elemsize * gi.y_len
 
         # -- write Y_out
-        n_bytes += elemsize * gi['y_len']
+        n_bytes += elemsize * gi.y_len
     return n_bytes
 
+class Dot(object):
 
-class DotSignature(object):
+    def __init__(self, j, x_j, a_j, x_start, a_start, a_stride0, a_shape1):
+        self.j = j
+        self.x_j = x_j
+        self.a_j = a_j
+        self.x_start = x_start
+        self.a_start = a_start
+        self.a_stride0 = a_stride0
+        self.a_shape1 = a_shape1   # Columns of A
 
-    def __init__(self, dct):
-        self.y_len = dct['y_len']
-        self.Ax_dims = tuple(
-            [(d['a_shape1'], d['a_stride0']) for d in dct['dots']])
+class GeometryEntry(object):
+
+    def __init__(self, y_len, y_start, y_in_start, dots):
+        self.y_len = y_len         # = Rows of A
+        self.y_start = y_start
+        self.y_in_start = y_in_start
+        assert all(isinstance(dot, Dot) for dot in dots)
+        self.dots = dots
+        self.Ax_dims = [(d.a_shape1, d.a_stride0) for d in dots]
 
     def __eq__(self, other):
         return type(self) == type(other) \
@@ -86,9 +100,66 @@ def __str__(self):
             counts[dim_stride] += 1
         return 'yd=%s <- %s' % (
             self.y_len,
-            ', '.join(('(%s x d=%s,s=%s)' % (counts[(d, s)], d, s))
-                      for (d, s) in counts))
+            ', '.join(('(%s x n=%s,stride=%s)' % (counts[(n, s)], n, s))
+                      for (n, s) in counts))
+
+
+class Geometry(object):
+    """The geometry of a gemv_prog."""
+
+    def __init__(self, A_starts, X_starts, Y_starts, Y_in_starts, A_stride0s,
+                 A_shape1s, Y_shape0s, X_js=None, A_js=None):
+        self._dbbs = []
+        for bb in range(len(Y_shape0s)):
+            dots = []
+            if X_js:
+                x_js_i = X_js[bb]
+                A_js_i = A_js[bb]
+                assert len(x_js_i) == len(A_js_i)
+                for jj, (xj, aj) in enumerate(zip(x_js_i, A_js_i)):
+                    assert xj.size == 1 and aj.size == 1
+                    xj, aj = xj[0], aj[0]  # to ignore numpy DeprecationWarning
+                    dots.append(Dot(jj, xj, aj, X_starts[xj], A_starts[aj],
+                                    A_stride0s[aj], A_shape1s[aj]))
+            self._dbbs.append(GeometryEntry(Y_shape0s[bb],
+                                            Y_starts[bb],
+                                            Y_in_starts[bb],
+                                            dots))
+
+    def __getitem__(self, key):
+        return self._dbbs[key]
+
+    def __str__(self):
+        return self.summary()
+
+    def summary(self, items=None):
+        if items is None:
+            gg = self._dbbs
+        else:
+            gg = [self._dbbs[i] for i in items]
 
+        outputs = len(gg)
+        dots = np.array([len(g.dots) for g in gg])
+        shape0s = np.array([g.y_len for g in gg])
+        shape1s = np.hstack([[d.a_shape1 for d in g.dots] for g in gg])
+        return ("outputs: %d; dots: %0.1f [%d, %d]; "
+                "shape: %0.1f [%d, %d] x %0.1f [%d, %d]"
+                % (outputs, dots.mean(), dots.min(), dots.max(),
+                   shape0s.mean(), shape0s.min(), shape0s.max(),
+                   shape1s.mean(), shape1s.min(), shape1s.max()))
+
+    def print_summary(self, items=None, full=False):
+        print('geometry_summary: tag=%s' % self.tag)
+        if items is None:
+            gg = self._dbbs
+        else:
+            gg = map(self._dbbs.__getitem__, items)
+
+        counts = defaultdict(lambda: 0)
+        for dsi in gg:
+            counts[dsi] += 1
+        for dsi, count in sorted(counts.iteritems(), key=lambda x: x[1]):
+            print('  %6s\t%s' % (count, dsi))
 
 class gemv_prog(object):
 
@@ -118,76 +189,16 @@ def __init__(self, queue, alpha, A, A_js, X, X_js, beta, Y,
         self.geometry = self._geometry()
         self.plans = self.choose_plans()
 
-    def geometry_summary(self, items=None):
-        if items is None:
-            gg = self.geometry
-        else:
-            gg = [self.geometry[i] for i in items]
-
-        outputs = len(gg)
-        dots = np.array([len(g['dots']) for g in gg])
-        shape0s = np.array([g['y_len'] for g in gg])
-        shape1s = np.hstack([[d['a_shape1'] for d in g['dots']] for g in gg])
-        return ("outputs: %d; dots: %0.1f [%d, %d]; "
-                "shape: %0.1f [%d, %d] x %0.1f [%d, %d]"
-                % (outputs, dots.mean(), dots.min(), dots.max(),
-                   shape0s.mean(), shape0s.min(), shape0s.max(),
-                   shape1s.mean(), shape1s.min(), shape1s.max()))
-
-    def print_geometry_summary(self, items=None, full=False):
-        print('geometry_summary: tag=%s' % self.tag)
-        if items is None:
-            gg = self.geometry
-        else:
-            gg = map(self.geometry.__getitem__, items)
+    def _geometry(self):
+        return Geometry(self.A.starts, self.X.starts, self.Y.starts,
+                        self.Y_in.starts, self.A.stride0s, self.A.shape1s,
+                        self.Y.shape0s, self.X_js, self.A_js)
 
-        ds = map(DotSignature, gg)
-        counts = defaultdict(lambda: 0)
-        for dsi in ds:
-            counts[dsi] += 1
-        for dsi in sorted(counts):
-            print('  %6s\t%s' % (counts[dsi], dsi))
+    def cl_geometry_and_textconf(self, items, padding=4, stride=0):
+        assert isinstance(stride, int) and stride in [0,1]
 
-    def _geometry(self):
-        A_starts = self.A.starts
-        X_starts = self.X.starts
-        Y_starts = self.Y.starts
-        Y_in_starts = self.Y_in.starts
-        A_stride0s = self.A.stride0s
-        A_shape1s = self.A.shape1s
-        Y_shape0s = self.Y.shape0s
-
-        rval = []
-        for bb in range(len(Y_shape0s)):
-            dbb = {
-                'y_len': Y_shape0s[bb],
-                'dots': [],
-                'y_start': Y_starts[bb],
-                'y_in_start': Y_in_starts[bb],
-            }
-            if self.X_js:
-                x_js_i = self.X_js[bb]
-                A_js_i = self.A_js[bb]
-                assert len(x_js_i) == len(A_js_i)
-                for jj, (xj, aj) in enumerate(zip(x_js_i, A_js_i)):
-                    assert xj.size == 1 and aj.size == 1
-                    xj, aj = xj[0], aj[0]  # to ignore numpy DeprecationWarning
-                    dbb['dots'].append({
-                        'j': jj,
-                        'x_j': xj,
-                        'a_j': aj,
-                        'x_start': X_starts[xj],
-                        'a_start': A_starts[aj],
-                        'a_stride0': A_stride0s[aj],
-                        'a_shape1': A_shape1s[aj],
-                    })
-            rval.append(dbb)
-
-        return rval
-
-    def cl_geometry_and_textconf(self, items, padding=4):
         p = self
-        max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items)
+        max_n_dots = max(len(p.geometry[ii].dots) for ii in items)
         n_structure_vars = 4 * max_n_dots + 5
         structure_vars_stride = int(
             padding * np.ceil(float(n_structure_vars) / padding))
@@ -197,7 +208,7 @@ def cl_geometry_and_textconf(self, items, padding=4):
         X_starts = p.X.starts
         Y_starts = p.Y.starts
         Y_in_starts = p.Y_in.starts
-        A_stride0s = p.A.stride0s
+        A_strides = [p.A.stride0s, p.A.stride1s][stride]
         A_shape1s = p.A.shape1s
         Y_shape0s = p.Y.shape0s
 
@@ -210,7 +221,7 @@ def cl_geometry_and_textconf(self, items, padding=4):
                 xi, ai = xi[0], ai[0]  # to ignore numpy DeprecationWarning
                 gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi]
                 gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai]
-                gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai]
+                gstructure[bbi, 2 * max_n_dots + ii] = A_strides[ai]
                 gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai]
             # -- offset of output and input buffers
             gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb]
@@ -227,7 +238,7 @@ def cl_geometry_and_textconf(self, items, padding=4):
             'structure_vars_stride': structure_vars_stride,
             'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots,
             'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots,
-            'a_s0': 'lstructure[2 * %s + ii]' % max_n_dots,
+            'a_s%d' % stride: 'lstructure[2 * %s + ii]' % max_n_dots,
             'N_i': 'lstructure[3 * %s + ii]' % max_n_dots,
             'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots,
             'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots,
@@ -260,10 +271,8 @@ def ref_impl(p, items):
     if 0:
         if len(items) < 10:
             print('Falling back on reference implementation')
-            p.print_geometry_summary(items, full=True)
         else:
             print('Falling back on reference implementation')
-            p.print_geometry_summary(items)
 
     assert all(s == 1 for s in p.A.stride1s)
     assert all(s == 1 for s in p.X.stride1s)
@@ -372,7 +381,7 @@ def ref_impl(p, items):
         Template(text, output_encoding='ascii').render(**p.__dict__))
 
     gsize = (
-        max(p.geometry[ii]['y_len'] for ii in items),
+        max(p.geometry[ii].y_len for ii in items),
         len(items))
     lsize = None
     fn = cl.Program(p.queue.context, text).build().gemv_ref
@@ -442,35 +451,34 @@ def reduce_impl(p, items,
         raise NotImplementedError()
     if p.cl_gamma is not None:
         raise NotImplementedError()
-    if not all(s == 1 for s in p.A.stride1s):
-        raise NotImplementedError()
+    if not all(s == 1 for s in p.A.stride0s):
+        raise NotImplementedError('A must be in column major')
 
     assert p.float_alpha is not None
     assert p.float_gamma is not None
 
-    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)
-    max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items])
-    max_reduce_len = max(max([gg['a_shape1']
-                              for gg in p.geometry[ii]['dots']])
+    cl_gstructure, textconf = p.cl_geometry_and_textconf(items, stride=1)
+    max_n_dots = max([len(p.geometry[ii].dots) for ii in items])
+    max_reduce_len = max(max([gg.a_shape1
+                              for gg in p.geometry[ii].dots])
                          for ii in items)
-    max_y_len = max([p.geometry[ii]['y_len'] for ii in items])
+    max_y_len = max([p.geometry[ii].y_len for ii in items])
 
     # segment means the piece of Y written by a work-group
     # group_size is the number of values that we're reducing over
 
-    if len(items) < 4:
-        if group_size is None:
-            group_size = 32  # XXX
-        if segment_size is None:
-            segment_size = min(max_y_len, 2)  # XXX
-    else:
-        if group_size is None:
-            group_size = 32  # XXX
-        if segment_size is None:
-            segment_size = min(max_y_len, 4)  # XXX
+    # TODO autotune
+    if group_size is None:
+        group_size = 4
+
+    if segment_size is None:
+        if len(items) < 8:
+            segment_size = min(max_y_len, 8)  # XXX
+        else:
+            segment_size = min(max_y_len, 32)  # XXX
     g_segments = int(np.ceil(float(max_y_len) / segment_size))
-    gsize = (group_size, g_segments * segment_size, len(items))
-    lsize = (group_size, segment_size, 1)
+    gsize = (g_segments * segment_size, group_size, len(items))
+    lsize = (segment_size, group_size, 1)
 
     max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size))
     textconf.update({
@@ -481,8 +489,8 @@ def reduce_impl(p, items,
         'group_size': group_size,
         'local_count': group_size * segment_size,
         'max_reduce_len': max_reduce_len,
-        'N_cutoff': max_reduce_iters * group_size,
         'max_n_dots': max_n_dots,
+        'log2_group_size': int(np.ceil(np.log2(group_size))),
     })
     if 0:
         for k, v in textconf.items():
@@ -501,19 +509,18 @@ def reduce_impl(p, items,
             const __global ${Y_in.cl_buf.ctype} *Y_in_data,
             __global ${Y.cl_buf.ctype} *Y_data)
     {
+        const int i = get_local_id(0);
+        const int j = get_local_id(1);
+        const int m = get_global_id(0);
+
         __local int lstructure[${n_structure_vars}];
-    % if segment_size > 1:
-        // we'll cache X in shared memory so we load it only once
-        // for the whole segment
-        __local ${X.cl_buf.ctype} lX[${group_size}];
-    % endif
+
         //Scratch space for the dot products
         __local ${Y.cl_buf.ctype}
-            partialDotProduct[${segment_size}][${group_size}];
+            sums[${group_size}][${segment_size}];
         __local ${Y.cl_buf.ctype}
             y_sum_pre[${segment_size}];
-        const int local_idx = get_local_id(0)
-            + get_local_id(1) * get_local_size(0);
+        const int local_idx = i + j * get_local_size(0);
 
         // load structure
     % if local_count < n_structure_vars:
@@ -524,7 +531,7 @@ def reduce_impl(p, items,
             lstructure[ii] = gstructure[
                 get_global_id(2) * ${structure_vars_stride} + ii];
         }
-    % else :
+    % else:
         if (local_idx < ${n_structure_vars})
         {
             lstructure[local_idx] = gstructure[
@@ -533,25 +540,25 @@ def reduce_impl(p, items,
     % endif
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len}))
+        if (j == 0 && m < ${y_len})
         {
     % if float_beta is not None and float_beta != 0 :
-            y_sum_pre[get_local_id(1)] = ${float_beta}
-                * Y_in_data[${y_in_starts} + get_global_id(1)];
+            y_sum_pre[i] = ${float_beta}
+                * Y_in_data[${y_in_starts} + m];
     % elif cl_beta is not None:
-            y_sum_pre[get_local_id(1)] = betas[${bb}]
-                * Y_in_data[${y_in_starts} + get_global_id(1)];
+            y_sum_pre[i] = betas[${bb}]
+                * Y_in_data[${y_in_starts} + m];
     % else :
-            y_sum_pre[get_local_id(1)] = 0;
+            y_sum_pre[i] = 0;
     % endif
 
     % if float_gamma is not None and float_gamma != 0:
-            y_sum_pre[get_local_id(1)] += ${float_gamma};
+            y_sum_pre[i] += ${float_gamma};
     % endif
-    // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]);
+    // printf("betaY + gamma=%f\\n", y_sum_pre[i]);
         }
 
-        partialDotProduct[get_local_id(1)][get_local_id(0)] = 0;
+        sums[j][i] = 0;
     % if max_n_dots > 1:
         for (int ii = 0;
                  ii < ${n_dot_products};
@@ -562,57 +569,48 @@ def reduce_impl(p, items,
     % endif
 
 
-        for (int nn = get_local_id(0);
-                 nn < ${N_cutoff};
-                 nn += get_local_size(0))
-        {
-    // segment_size = ${segment_size}
-    % if (segment_size == 1):
-            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
-            {
-            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
-                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
-                * X_data[${x_starts} + nn];
-            }
-    % else:
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if ((get_local_id(1) == 0) && (nn < ${N_i}))
-            {
-                lX[get_local_id(0)] = X_data[${x_starts} + nn];
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
+
+        __global ${A.cl_buf.ctype}* a = A_data + ${a_starts} + m;
+
+    % if segment_size >= 4:
+        // we'll cache X in shared memory so we load it only once
+        // for the whole segment
+        __local ${X.cl_buf.ctype} lX[${max_reduce_len}];
+        for (int k = local_idx; k < ${N_i}; k += ${local_count})
+            lX[k] = X_data[${x_starts} + k];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    % endif
+
+        if (m < ${y_len}) {
+            for (int k = get_global_id(1); k < ${N_i}; k += get_global_size(1))
             {
-            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
-                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
-                * lX[get_local_id(0)];
+        // segment_size = ${segment_size}
+        % if segment_size < 4:
+                sums[j][i] += a[${a_s1} * k] * X_data[${x_starts} + k];
+        % else:
+                sums[j][i] += a[${a_s1} * k] * lX[k];
+        % endif
             }
-    % endif
         }
 
     % if (max_n_dots > 1):
         }
     % endif
 
-        // -- Parallel reduction long work-group dimension 0
-        for (uint stride = 1;
-                  stride < get_local_size(0);
-                  stride *= 2)
-        {
+        // -- Parallel reduction
+        % for ks in range(log2_group_size - 1, -1, -1):
             barrier(CLK_LOCAL_MEM_FENCE);
-
-            uint index = 2 * stride * get_local_id(0);
-            if (index + stride < get_local_size(0))
-            {
-                partialDotProduct[get_local_id(1)][index] +=
-                    partialDotProduct[get_local_id(1)][index + stride];
+            if (j < ${2**ks}) {
+                sums[j][i] += sums[j + ${2**ks}][i];
+        % if ks == 0:
+                if (m < ${y_len}) {
+                    Y_data[${y_offset} + m] = y_sum_pre[i]
+                        + ${float_alpha} * sums[0][i];
+                }
+        % endif
             }
-        }
-        // barrier(CLK_LOCAL_MEM_FENCE);
-        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) {
-            Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)]
-                + ${float_alpha} * partialDotProduct[get_local_id(1)][0];
-        }
+        % endfor
+
     }
         """
 
@@ -640,7 +638,7 @@ def reduce_impl(p, items,
                 flops_per_call=flops_from_geometry(p.geometry, items),
                 )
     rval.full_args = full_args  # prevent GC the args
-    rval.description = p.geometry_summary(items)
+    rval.description = p.geometry.summary(items)
     return rval
 
 
@@ -671,7 +669,7 @@ def many_dots_impl(p, items):
         raise NotImplementedError()
     if p.cl_gamma is not None:
         raise NotImplementedError()
-    if not all(s == 1 for s in p.A.stride1s):
+    if not all(s == 1 for s in p.A.stride0s):
         raise NotImplementedError()
 
     assert p.float_alpha is not None
@@ -681,12 +679,12 @@ def many_dots_impl(p, items):
         # -- easy probably, but not done
         raise NotImplementedError()
     A_js_shape0s = p.A_js.shape0s
-    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)
+    cl_gstructure, textconf = p.cl_geometry_and_textconf(items, stride=1)
 
     # min_n_dots = min(A_js_shape0s)
     max_n_dots = max(A_js_shape0s)
 
-    max_y_len = max(p.geometry[ii]['y_len'] for ii in items)
+    max_y_len = max(p.geometry[ii].y_len for ii in items)
     MAX_SEGMENT_SIZE = 16  # tricky to tune?
 
     segment_size = min(
@@ -713,13 +711,7 @@ def many_dots_impl(p, items):
         'segment_idx': 'segment_idx',
         'dot_block_idx': 'dot_block_idx',
     })
-    if 0:
-        for k, v in textconf.items():
-            print(k, v)
     textconf.update(p.__dict__)
-    #    print('float_gamma', textconf['float_gamma'])
-    #    print('cl_gamma', textconf['cl_gamma'])
-    #    print('clra_gamma', textconf['clra_gamma'])
 
     text = """
         __kernel void gemv_many_dots(
@@ -778,11 +770,13 @@ def many_dots_impl(p, items):
                      ii < ${n_dot_products};
                      ii += ${dot_block_size})
             {
-                for (int nn = 0; nn < ${N_i}; nn += 1)
+                const __global ${A.cl_buf.ctype}* a = A_data + ${a_starts}
+                                                      + get_global_id(0);
+                const __global ${X.cl_buf.ctype}* x = X_data + ${x_starts};
+                for (int nn = 0; nn < ${N_i}; nn++)
                 {
-                    y_sum_post[dot_block_idx][segment_idx]
-                    += A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn]
-                       * X_data[${x_starts} + nn];
+                    y_sum_post[dot_block_idx][segment_idx] +=
+                        a[${a_s1} * nn] * x[nn];
                 }
             }
         }
@@ -825,30 +819,30 @@ def many_dots_impl(p, items):
                 flops_per_call=flops_from_geometry(p.geometry, items),
                 )
     rval.full_args = full_args  # prevent GC the args
-    rval.description = p.geometry_summary(items)
+    rval.description = p.geometry.summary(items)
     return rval
 
 
 def block_impl(p, items):
 
     if p.clra_alpha is not None:
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: clra_alpha not supported')
     if p.clra_gamma is not None:
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: clra_gamma not supported')
     if p.clra_beta is not None:
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: clra_beta not supported')
     if p.cl_alpha is not None:
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: cl_alpha not supported')
     if p.cl_beta is not None:
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: cl_beta not supported')
     if p.cl_gamma is not None:
-        raise NotImplementedError()
-    if not all(s == 1 for s in p.A.stride1s):
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: cl_gamma not supported')
+    if not all(s == 1 for s in p.A.stride0s):
+        raise NotImplementedError('block_impl: A must be in column-major')
 
     if p.A_js is None:
         # -- easy probably, but not done
-        raise NotImplementedError()
+        raise NotImplementedError('block_impl: A_js cannot be None')
 
     # --- blocking
     # We want to group the dot products into blocks, so that each workgroup
@@ -857,7 +851,7 @@ def block_impl(p, items):
     # region of this buffer, then reduce across the buffer in a separate kernel
 
     # block_y = 8
-    block_y = 32
+    block_y = 128
     # block_x = 32
     block_x = 128
 
@@ -924,7 +918,7 @@ def block_impl(p, items):
             bw_reduce += shape0i*(len(Ybufind_reduce) + 1) * p.Y.dtype.itemsize
 
     # --- create structure
-    gstructure = np.column_stack([shape0s, shape1s, Astride0s, Astride1s,
+    gstructure = np.column_stack([shape0s, shape1s, Astride1s,
                                   Astarts, Xstride0s, Xstarts, Ybufstarts])
     cl_gstructure = to_device(p.queue, gstructure.astype(np.int32))
 
@@ -936,7 +930,7 @@ def block_impl(p, items):
     lsize0_log2 = int(np.log2(lsize0))
     assert 2**lsize0_log2 == lsize0
 
-    lsize = (lsize0, block_y, 1)
+    lsize = (block_y, lsize0, 1)
     gsize = (lsize[0], lsize[1], gstructure.shape[0])
     assert np.prod(lsize) >= block_x
 
@@ -947,12 +941,11 @@ def block_impl(p, items):
         n_structure_vars=gstructure.shape[1],
         shape0='lstructure[0]',
         shape1='lstructure[1]',
-        Astride0='lstructure[2]',
-        Astride1='lstructure[3]',
-        Astart='lstructure[4]',
-        Xstride0='lstructure[5]',
-        Xstart='lstructure[6]',
-        Ybufstart='lstructure[7]',
+        Astride1='lstructure[2]',
+        Astart='lstructure[3]',
+        Xstride0='lstructure[4]',
+        Xstart='lstructure[5]',
+        Ybufstart='lstructure[6]',
         block_y=block_y,
         block_x=block_x,
         lsize0=lsize0,
@@ -975,9 +968,9 @@ def block_impl(p, items):
         __global ${Ybuf.ctype} *Ybufdata
         )
     {
-        const int j = get_global_id(0);
-        const int i = get_global_id(1);
-        const int n = get_global_id(2);
+        const int i = get_global_id(0);       // Row index
+        const int j = get_global_id(1);       // Column group index
+        const int item_i = get_global_id(2);
 
         // load structure
         __local int lstructure[${n_structure_vars}];
@@ -985,7 +978,7 @@ def block_impl(p, items):
             get_local_id(0) + get_local_id(1)*get_local_size(0);
         if (local_idx < ${n_structure_vars})
             lstructure[local_idx] = gstructure[
-                n * ${n_structure_vars} + local_idx];
+                item_i * ${n_structure_vars} + local_idx];
         barrier(CLK_LOCAL_MEM_FENCE);
 
         __global const ${X.ctype} *x = Xdata + ${Xstart};
@@ -997,25 +990,33 @@ def block_impl(p, items):
             xlocal[local_idx] = x[local_idx*${Xstride0}];
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        __local ${Ybuf.ctype} sums[${block_y}][${lsize0}];
-        sums[i][j] = 0;
+        __local ${Ybuf.ctype} sums[${lsize0}][${block_y}];
+        sums[j][i] = 0;
+
+        const int ncols = max(1, (int)(${shape1} / get_local_size(1)));
+        const int jj_max = j + 1 < get_local_size(1) ?
+                             min((j+1) * ncols, ${shape1}) : ${shape1};
+
+        __global const ${A.ctype} *Ai = Adata + ${Astart} + i;
 
         if (i < ${shape0}) {
-            __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0};
-            for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) {
-                sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj];
+            for(int jj = j * ncols; jj < jj_max; jj++) {
+                sums[j][i] += Ai[jj*${Astride1}] * xlocal[jj];
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
-    % for k in range(lsize0_log2 - 1, 0, -1):
-        if (j < ${2**k})
-            sums[i][j] += sums[i][${2**k} + j];
+    % for k in range(lsize0_log2 - 1, -1, -1):
         barrier(CLK_LOCAL_MEM_FENCE);
+        if (j < ${2**k}) {
+            sums[j][i] += sums[j + ${2**k}][i];
+        % if k == 0:
+            if (i < ${shape0}) {
+                ybuf[i] = ${float_alpha} * sums[j][i];
+            }
+       % endif
+       }
     % endfor
-
-        if (i < ${shape0} && j == 0)
-            ybuf[i] = ${float_alpha} * (sums[i][0] + sums[i][1]);
     }
     """
 
@@ -1030,7 +1031,7 @@ def block_impl(p, items):
                 flops_per_call=flops_from_geometry(p.geometry, items),
                 )
     plan.full_args = full_args  # prevent GC the args
-    plan.description = p.geometry_summary(items)
+    plan.description = p.geometry.summary(items)
     plan.Ybuf = clYbuf
 
     # --- Reduce kernel
@@ -1121,11 +1122,171 @@ def block_impl(p, items):
                        name='clra_gemv.block_impl_reduce', tag=p.tag)
     plan_reduce.full_args = full_args_reduce  # prevent GC of the args
     plan_reduce.bw_per_call = bw_reduce
-    # plan_reduce.description = p.geometry_summary(items)
+    # plan_reduce.description = p.geometry.summary(items)
 
     return [plan, plan_reduce]
 
 
+def one_thread_per_row_impl(p, items):
+    """Each thread computes the dot product over one row.
+       The rows are grouped consecutively when y_len is small.
+    """
+    if p.clra_alpha is not None:
+        raise NotImplementedError()
+    if p.clra_gamma is not None:
+        raise NotImplementedError()
+    if p.clra_beta is not None:
+        raise NotImplementedError()
+    if p.cl_alpha is not None:
+        raise NotImplementedError()
+    if p.cl_gamma is not None:
+        raise NotImplementedError()
+    if not all(s == 1 for s in p.A.stride0s):
+        raise NotImplementedError('A must be in column major')
+
+    num_items = len(items)
+    assert num_items > 0
+
+    assert p.float_alpha is not None
+    assert p.float_gamma is not None
+
+    if p.A_js is None:
+        # -- easy probably, but not done
+        raise NotImplementedError()
+
+    cl_gstructure, textconf = p.cl_geometry_and_textconf(items,
+                                                         stride=1)
+
+    max_y_len = max(p.geometry[ii].y_len for ii in items)
+    max_n_dots = max(len(p.geometry[ii].dots) for ii in items)
+    all_same_y_len = len(set(p.geometry[ii].y_len for ii in items)) == 1
+
+
+    # Compute in consecutive threads if max_y_len is small,
+    # and all y_len are the same.
+    # TODO autotune
+    if all_same_y_len and max_y_len < 64:
+        consecutive = True
+        gsize = (max_y_len * num_items,)
+    else:
+        consecutive = False
+        gsize = (max_y_len, num_items)
+
+    textconf.update({
+        'max_n_dots': max_n_dots,
+        'max_y_len': max_y_len,
+        'consecutive': consecutive,
+        'all_same_y_len': all_same_y_len,
+    })
+    textconf.update(p.__dict__)
+
+    """
+    int i, item_i;
+% for ix, sprev, s, item_i in zip(range(len(items)), num_rows, num_rows[1:], items):
+    % if ix == 0:
+         if (global_i < ${s})
+    % else:
+         else if (global_i < ${s})
+    % endif
+         {
+            item_i = ${item_i};
+            i = global_i - ${sprev};
+         }
+    % endfor
+    """
+
+    text = """
+    __kernel void gemv(
+                const __global int* gstructure,
+                const __global ${A.cl_buf.ctype}* A_data,
+                const __global ${X.cl_buf.ctype}* X_data,
+            % if cl_beta is not None:
+                const __global ${cl_beta.ctype}* betas,
+            % endif
+                const __global ${Y_in.cl_buf.ctype}* Y_in_data,
+                __global ${Y.cl_buf.ctype}* Y_data)
+    {
+    % if consecutive:
+        const int global_i = get_global_id(0);
+
+        const int item_i = global_i / ${max_y_len};     // Item index
+        const int i = global_i - item_i * ${max_y_len}; // Row index within item
+    % else:
+        const int item_i = get_global_id(1);  // Item index
+        const int i = get_global_id(0);       // Row index within item
+    % endif
+
+        const __global int* lstructure =
+            gstructure + item_i * ${structure_vars_stride};
+
+    % if not all_same_y_len:
+        if (i >= ${y_len}) return;
+    % endif
+
+        ${Y.cl_buf.ctype} sum = 0;
+
+    % if max_n_dots > 1:
+        for (int ii = 0; ii < ${n_dot_products}; ii++)
+        {
+    % else:
+        const int ii = 0;
+    % endif
+
+            const int a_s1 = ${a_s1};
+            const int n_i = ${N_i};
+
+            const __global ${A.cl_buf.ctype}* a = A_data + ${a_starts} + i;
+            const __global ${X.cl_buf.ctype}* x = X_data + ${x_starts};
+
+            for (int j=0;j<n_i;j++)
+                sum += a[a_s1*j] * x[j];
+
+    % if max_n_dots > 1:
+        }
+    % endif
+
+    % if float_gamma is not None:
+        const ${Y.cl_buf.ctype} gamma = ${float_gamma};
+    % else:
+        const ${Y.cl_buf.ctype} gamma = 0;
+    % endif
+
+    % if float_beta is not None and float_beta != 0 :
+        Y_data[${y_offset} + i] = ${float_alpha} * sum + ${float_beta} * Y_in_data[${y_in_starts} + i] + gamma;
+    % elif cl_beta is not None:
+        Y_data[${y_offset} + i] = ${float_alpha} * sum + betas[${bb}] * Y_in_data[${y_in_starts} + i] + gamma;
+    % else:
+        Y_data[${y_offset} + i] = ${float_alpha} * sum + gamma;
+    % endif
+    }
+    """
+
+    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))
+
+    fn = cl.Program(p.queue.context, text).build().gemv
+
+    full_args = [
+        cl_gstructure,
+        p.A.cl_buf,
+        p.X.cl_buf,
+    ] + ([p.cl_beta] if p.cl_beta is not None else []) + [
+        p.Y_in.cl_buf,
+        p.Y.cl_buf,
+    ]
+
+    fn.set_args(*[arr.data for arr in full_args])
+    rval = Plan(p.queue, fn, gsize, None,
+                name='clra_gemv.one_thread_per_row_impl',
+                tag=p.tag,
+                bw_per_call=bw_from_geometry(p.geometry, items),
+                flops_per_call=flops_from_geometry(p.geometry, items),
+                )
+    rval.full_args = full_args  # prevent GC the args
+    rval.description = p.geometry.summary(items)
+    return rval
+
+
+
 class plan_ref_gemv(gemv_prog):
     def choose_plans(self):
         return [ref_impl(self, range(len(self.Y)))]
@@ -1156,9 +1317,9 @@ def choose_plans(self):
 
         long_dots = [
             ii for ii in remaining_items
-            if len(self.geometry[ii]['dots']) <= 2
-            and max([0] + [dct['a_shape1']
-                           for dct in self.geometry[ii]['dots']]) > 16]
+            if len(self.geometry[ii].dots) <= 2
+            and max([0] + [dct.a_shape1
+                           for dct in self.geometry[ii].dots]) > 16]
         if long_dots:
             try:
                 long_plan = reduce_impl(self, long_dots)
@@ -1172,7 +1333,7 @@ def choose_plans(self):
 
         # many_dots = [ii
             # for ii in remaining_items
-            # if len(self.geometry[ii]['dots']) > 3]
+            # if len(self.geometry[ii].dots) > 3]
         many_dots = remaining_items
         if many_dots:
             try:
@@ -1191,3 +1352,88 @@ def choose_plans(self):
             plans.append(remaining_plan)
 
         return plans
+
+class plan_one_thread_per_row_gemv(gemv_prog):
+    def choose_plans(self):
+        return [one_thread_per_row_impl(self, range(len(self.Y)))]
+
+class plan_pretuned_gemv(gemv_prog):
+    PLANS = (one_thread_per_row_impl, reduce_impl, many_dots_impl, block_impl)
+
+    # Tuning grid obtained on a NVIDIA GTX 970.
+    # The indices correspond to functions in PLANS above.
+    TUNING_GRID = np.array(
+        [  [[ 0, 0, 0, 0, 1, 2, 0]
+          , [ 0, 1, 1, 1, 1, 2, 0]
+          , [ 1, 1, 1, 1, 1, 0, 0]
+          , [ 0, 1, 1, 1, 1, 1, 0]
+          , [ 1, 1, 1, 1, 1, 1, 1]
+          , [ 3, 3, 3, 1, 1, 1, 1]
+          , [ 3, 3, 3, 3, 1, 1, 1]]
+         ,
+           [[ 1, 0, 2, 0, 1, 0, 0]
+          , [ 2, 0, 2, 0, 0, 0, 0]
+          , [ 1, 1, 1, 0, 1, 0, 0]
+          , [ 1, 1, 1, 1, 1, 0, 1]
+          , [ 1, 1, 1, 2, 1, 0, 1]
+          , [ 3, 3, 3, 3, 1, 1, 1]
+          , [ 3, 3, 3, 3, 2, 0, 2]]
+         ,
+           [[ 0, 0, 0, 1, 1, 0, 0]
+          , [ 2, 2, 2, 0, 2, 0, 0]
+          , [ 1, 0, 0, 2, 1, 0, 0]
+          , [ 1, 0, 1, 0, 0, 0, 1]
+          , [ 1, 1, 1, 1, 1, 0, 1]
+          , [ 3, 3, 3, 3, 1, 0, 1]
+          , [ 3, 3, 3, 3, 2, 0, 2]]
+         ,
+           [[ 1, 0, 0, 0, 0, 0, 0]
+          , [ 1, 2, 0, 0, 0, 0, 0]
+          , [ 0, 0, 0, 1, 0, 0, 0]
+          , [ 2, 1, 0, 0, 0, 0, 0]
+          , [ 1, 1, 0, 0, 0, 0, 0]
+          , [ 3, 3, 3, 1, 0, 3, 0]
+          , [ 3, 3, 3, 3, 0, 3, 0]]
+         ,
+           [[ 1, 2, 0, 0, 0, 0, 0]
+          , [ 2, 0, 0, 1, 0, 0, 0]
+          , [ 1, 0, 0, 0, 0, 0, 0]
+          , [ 2, 0, 1, 0, 0, 0, 0]
+          , [ 1, 1, 1, 0, 0, 0, 0]
+          , [ 3, 3, 1, 0, 0, 0, 0]
+          , [ 3, 3, 3, 0, 0, 0, 0]]
+         ,
+           [[ 0, 0, 0, 0, 0, 0, 0]
+          , [ 2, 0, 0, 0, 0, 0, 0]
+          , [ 1, 0, 0, 0, 0, 0, 0]
+          , [ 1, 0, 0, 0, 0, 0, 0]
+          , [ 1, 0, 0, 0, 0, 0, 0]
+          , [ 3, 3, 0, 3, 0, 0, 0]
+          , [ 3, 3, 0, 0, 0, 0, 0]]
+         ,
+           [[ 2, 0, 0, 0, 0, 0, 0]
+          , [ 1, 0, 0, 0, 0, 0, 0]
+          , [ 0, 0, 0, 0, 0, 0, 0]
+          , [ 0, 0, 0, 0, 0, 0, 0]
+          , [ 0, 0, 0, 3, 0, 0, 0]
+          , [ 3, 0, 0, 0, 0, 0, 0]
+          , [ 3, 0, 0, 0, 0, 0, 0]]]
+        , dtype='uint8')
+    LOG_BASE = 4
+
+    def choose_plans(self):
+        items = range(len(self.Y))
+        m, n, k = 1, 1, 1
+        for ii in items:
+            m = max(m, self.geometry[ii].y_len)
+            n = max(n, max(d.a_shape1 for d in self.geometry[ii].dots))
+            k = max(k, len(self.geometry[ii].dots))
+        logm, logn, logk = [
+            min(int(round(math.log(x, plan_pretuned_gemv.LOG_BASE))),
+                plan_pretuned_gemv.TUNING_GRID.shape[i] - 1)
+            for i,x in enumerate((m,n,k))]
+
+        ind = plan_pretuned_gemv.TUNING_GRID[logm, logn, logk]
+        plan = plan_pretuned_gemv.PLANS[ind](self, range(len(self.Y)))
+
+        return plan if isinstance(plan, list) else [plan]
diff --git a/nengo_ocl/clra_nonlinearities.py b/nengo_ocl/clra_nonlinearities.py
index 5fb0f25..311d518 100644
--- a/nengo_ocl/clra_nonlinearities.py
+++ b/nengo_ocl/clra_nonlinearities.py
@@ -27,7 +27,7 @@ def blockify_matrices(max_size, ras):
         assert len(ra) == N
         assert (ra.shape1s == ra0.shape1s).all()
         assert (ra.shape0s == ra0.shape0s).all()
-        assert (ra.shape1s == ra.stride0s).all(), "not contiguous"
+        assert ((ra.shape1s == 1) | (ra.shape0s == ra.stride1s)).all(), "not contiguous"
 
     sizes = []
     inds = []
@@ -123,8 +123,7 @@ def plan_timeupdate(queue, step, time, dt):
 def plan_reset(queue, Y, values, tag=None):
     assert len(Y) == len(values)
 
-    assert (Y.stride0s == Y.shape1s).all()
-    assert (Y.stride1s == 1).all()
+    assert (Y.stride0s == 1).all()
     assert Y.ctype == values.ctype
 
     text = """
@@ -193,6 +192,7 @@ def plan_copy(queue, A, B, incs, tag=None):
 
     for arr in [A, B]:
         assert (arr.shape1s == 1).all()
+        assert (arr.stride0s == 1).all()
     assert (A.shape0s == B.shape0s).all()
 
     assert A.ctype == B.ctype
@@ -203,10 +203,8 @@ def plan_copy(queue, A, B, incs, tag=None):
     % if inc is None:
             __global const int *incdata,
     % endif
-            __global const int *Astrides,
             __global const int *Astarts,
             __global const ${Atype} *Adata,
-            __global const int *Bstrides,
             __global const int *Bstarts,
             __global ${Btype} *Bdata,
             __global const int *sizes
@@ -221,12 +219,12 @@ def plan_copy(queue, A, B, incs, tag=None):
             __global ${Btype} *b = Bdata + Bstarts[n];
 
     % if inc is True:
-            b[i*Bstrides[n]] += a[i*Astrides[n]];
+            b[i] += a[i];
     % elif inc is False:
-            b[i*Bstrides[n]] = a[i*Astrides[n]];
+            b[i] = a[i];
     % else:
-            if (incdata[n])  b[i*Bstrides[n]] += a[i*Astrides[n]];
-            else             b[i*Bstrides[n]] = a[i*Astrides[n]];
+            if (incdata[n])  b[i] += a[i];
+            else             b[i] = a[i];
     % endif
         }
         """
@@ -246,10 +244,8 @@ def plan_copy(queue, A, B, incs, tag=None):
     textconf = dict(Atype=A.ctype, Btype=B.ctype, N=N, inc=None)
 
     full_args = [
-        to_device(queue, A.stride0s[inds]),
         to_device(queue, Astarts),
         A.cl_buf,
-        to_device(queue, B.stride0s[inds]),
         to_device(queue, Bstarts),
         B.cl_buf,
         to_device(queue, sizes),
@@ -278,10 +274,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None):
     N = len(A)
     assert len(A) == len(B) == len(Ainds) == len(Binds)
 
-    for arr in [A, B, Ainds, Binds]:
+    for arr in A, B, Ainds, Binds:
         assert (arr.shape1s == 1).all()
-        assert (arr.stride1s == 1).all()
-    for arr in [Ainds, Binds]:
         assert (arr.stride0s == 1).all()
     assert (Ainds.shape0s == Binds.shape0s).all()
 
@@ -294,10 +288,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None):
     % if inc is None:
             __global const int *incdata,
     % endif
-            __global const int *Astride0s,
             __global const int *Astarts,
             __global const ${Atype} *Adata,
-            __global const int *Bstride0s,
             __global const int *Bstarts,
             __global ${Btype} *Bdata,
             __global const int *Isizes,
@@ -316,18 +308,17 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None):
             __global ${Btype} *b = Bdata + Bstarts[n];
             __global const int *aind = AIdata + AIstarts[n];
             __global const int *bind = BIdata + BIstarts[n];
-            const int Astride0 = Astride0s[n], Bstride0 = Bstride0s[n];
 
             if (i < Isizes[n]) {
     % if inc is True:
-                b[bind[i]*Bstride0] += a[aind[i]*Astride0];
+                b[bind[i]] += a[aind[i]];
     % elif inc is False:
-                b[bind[i]*Bstride0] = a[aind[i]*Astride0];
+                b[bind[i]] = a[aind[i]];
     % else:
                 if (incdata[n])
-                    b[bind[i]*Bstride0] += a[aind[i]*Astride0];
+                    b[bind[i]] += a[aind[i]];
                 else
-                    b[bind[i]*Bstride0] = a[aind[i]*Astride0];
+                    b[bind[i]] = a[aind[i]];
     % endif
             }
         }
@@ -346,10 +337,8 @@ def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None):
     textconf = dict(Atype=A.ctype, Btype=B.ctype, N=N, inc=None)
 
     full_args = [
-        to_device(queue, A.stride0s[inds]),
         to_device(queue, A.starts[inds]),
         A.cl_buf,
-        to_device(queue, B.stride0s[inds]),
         to_device(queue, B.starts[inds]),
         B.cl_buf,
         to_device(queue, sizes),
@@ -385,14 +374,11 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None):
     assert len(Y) == N and len(A) == N
 
     for arr in [A, X, Y]:
-        assert (arr.stride1s == 1).all()
+        assert (arr.stride0s == 1).all()
     assert ((X.shape0s == 1) | (X.shape0s == Y.shape0s)).all()
     assert ((X.shape1s == 1) | (X.shape1s == Y.shape1s)).all()
     assert ((A.shape0s == 1) | (A.shape0s == Y.shape0s)).all()
     assert ((A.shape1s == 1) | (A.shape1s == Y.shape1s)).all()
-    assert (X.stride1s == 1).all()
-    assert (Y.stride1s == 1).all()
-    assert (A.stride1s == 1).all()
 
     assert X.ctype == Y.ctype
     assert A.ctype == Y.ctype
@@ -400,35 +386,35 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None):
     text = """
         inline ${Ytype} get_element(
             __global const ${Ytype} *data,
-            const int shape0, const int shape1, const int stride0,
+            const int shape0, const int shape1, const int stride1,
             const int i, const int j
         )
         {
             if (shape0 == 1 && shape1 == 1)
                 return data[0];
             else if (shape0 == 1)
-                return data[j];
+                return data[j * stride1];
             else if (shape1 == 1)
-                return data[i * stride0];
+                return data[i];
             else
-                return data[i * stride0 + j];
+                return data[i + j * stride1];
         }
 
         ////////// MAIN FUNCTION //////////
         __kernel void elementwise_inc(
             __global const int *Ashape0s,
             __global const int *Ashape1s,
-            __global const int *Astride0s,
+            __global const int *Astride1s,
             __global const int *Astarts,
             __global const ${Atype} *Adata,
             __global const int *Xshape0s,
             __global const int *Xshape1s,
-            __global const int *Xstride0s,
+            __global const int *Xstride1s,
             __global const int *Xstarts,
             __global const ${Xtype} *Xdata,
             __global const int *Yshape0s,
             __global const int *Yshape1s,
-            __global const int *Ystride0s,
+            __global const int *Ystride1s,
             __global const int *Ystarts,
             __global ${Ytype} *Ydata
         )
@@ -440,24 +426,24 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None):
 
             const int Ashape0 = Ashape0s[n];
             const int Ashape1 = Ashape1s[n];
-            const int Astride0 = Astride0s[n];
+            const int Astride1 = Astride1s[n];
             const int Xshape0 = Xshape0s[n];
             const int Xshape1 = Xshape1s[n];
-            const int Xstride0 = Xstride0s[n];
-            const int Yshape1 = Yshape1s[n];
-            const int Ystride0 = Ystride0s[n];
-            const int Ysize = Yshape0s[n] * Yshape1;
+            const int Xstride1 = Xstride1s[n];
+            const int Yshape0 = Yshape0s[n];
+            const int Ystride1 = Ystride1s[n];
+            const int Ysize = Yshape1s[n] * Yshape0;
 
             for (int ij = get_global_id(0);
                  ij < Ysize;
                  ij += get_global_size(0))
             {
-                int i = ij / Yshape1;
-                int j = ij % Yshape1;
+                const int i = ij % Yshape0;
+                const int j = ij / Yshape0;
 
-                ${Atype} aa = get_element(a, Ashape0, Ashape1, Astride0, i, j);
-                ${Xtype} xx = get_element(x, Xshape0, Xshape1, Xstride0, i, j);
-                y[i * Ystride0 + j] += aa * xx;
+                const ${Atype} aa = get_element(a, Ashape0, Ashape1, Astride1, i, j);
+                const ${Xtype} xx = get_element(x, Xshape0, Xshape1, Xstride1, i, j);
+                y[i + j * Ystride1] += aa * xx;
             }
         }
         """
@@ -468,17 +454,17 @@ def plan_elementwise_inc(queue, A, X, Y, tag=None):
     full_args = (
         A.cl_shape0s,
         A.cl_shape1s,
-        A.cl_stride0s,
+        A.cl_stride1s,
         A.cl_starts,
         A.cl_buf,
         X.cl_shape0s,
         X.cl_shape1s,
-        X.cl_stride0s,
+        X.cl_stride1s,
         X.cl_starts,
         X.cl_buf,
         Y.cl_shape0s,
         Y.cl_shape1s,
-        Y.cl_stride0s,
+        Y.cl_stride1s,
         Y.cl_starts,
         Y.cl_buf,
     )
@@ -510,8 +496,7 @@ def plan_linearfilter(queue, X, Y, A, B, Xbuf, Ybuf, tag=None):
     assert len(Y) == N and len(A) == N and len(B) == N
 
     for arr in [X, Y, A, B, Xbuf, Ybuf]:
-        assert (arr.shape1s == arr.stride0s).all()
-        assert (arr.stride1s == 1).all()
+        assert (arr.stride0s == 1).all()
     for arr in [X, Y, A, B]:  # vectors
         assert (arr.shape1s == 1).all()
     assert (X.shape0s == Y.shape0s).all()
@@ -672,14 +657,11 @@ def plan_probes(queue, periods, X, Y, tag=None):
     N = len(X)
 
     # N.B.  X[i].shape = (M, N)
-    #       Y[i].shape = (buf_len, M * N)
+    #       Y[i].shape = (M * N, buflen)
     for arr in [X, Y]:
-        assert (arr.stride1s == 1).all()
-    assert (X.shape0s * X.shape1s == Y.shape1s).all()
-    assert (X.stride0s == X.shape1s).all()
-    assert (X.stride1s == 1).all()
-    assert (Y.stride0s == Y.shape1s).all()
-    assert (Y.stride1s == 1).all()
+        assert (arr.stride0s == 1).all()
+    assert (X.shape0s * X.shape1s == Y.shape0s).all()
+    assert (Y.stride1s == Y.shape0s).all()
 
     periods = np.asarray(periods, dtype='float32')
     cl_periods = to_device(queue, periods)
@@ -782,7 +764,7 @@ def plan_direct(queue, code, init, input_names, inputs, output, tag=None):
     for x in inputs:
         assert len(x) == len(output)
     for x in inputs + [output]:
-        assert (x.shape1s == 1).all() and (x.stride1s == 1).all()
+        assert (x.shape1s == 1).all() and (x.stride1s == x.shape0s).all()
         assert (x.stride0s == 1).all()
 
     input_types = [x.ctype for x in inputs]
@@ -843,7 +825,7 @@ def plan_direct(queue, code, init, input_names, inputs, output, tag=None):
 
 
 def plan_lif(queue, dt, J, V, W, outS, ref, tau, N=None, tau_n=None,
-             inc_n=None, upsample=1, **kwargs):
+             inc_n=None, upsample=1, fastlif=False, **kwargs):
     adaptive = N is not None
     assert J.ctype == 'float'
     for array in [V, W, outS]:
@@ -864,37 +846,62 @@ def plan_lif(queue, dt, J, V, W, outS, ref, tau, N=None, tau_n=None,
                     dtu=dt/upsample, dtu_inv=upsample/dt, dt_inv=1/dt)
     decs = """
         char spiked;
-        ${type} dV, overshoot;
+        ${type} dV;
         const ${type} dtu = ${dtu}, dtu_inv = ${dtu_inv}, dt_inv = ${dt_inv};
 % if adaptive:
         const ${type} dt = ${dt};
 % endif
         const ${type} V_threshold = 1;
+%if fastlif:
+        const ${type} delta_t = dtu;
+%else:
+        ${type} delta_t;
+%endif
         """
     # TODO: could precompute -expm1(-dtu / tau)
     text = """
         spiked = 0;
 
 % for ii in range(upsample):
+        W -= dtu;
+% if not fastlif:
+        if (W > 0)
+            delta_t = max((${type})0, dtu - W);
+        else
+            delta_t = dtu;
+% endif
 % if adaptive:
-        dV = -expm1(-dtu / tau) * (J - N - V);
+        dV = -expm1(-delta_t / tau) * (J - N - V);
 % else:
-        dV = -expm1(-dtu / tau) * (J - V);
+        dV = -expm1(-delta_t / tau) * (J - V);
 % endif
         V += dV;
-        W -= dtu;
 
+% if fastlif:
         if (V < 0 || W > dtu)
             V = 0;
         else if (W >= 0)
             V *= 1 - W * dtu_inv;
+% endif
 
         if (V > V_threshold) {
-            overshoot = dtu * (V - V_threshold) / dV;
+% if fastlif:
+            const ${type} overshoot = dtu * (V - V_threshold) / dV;
             W = ref - overshoot + dtu;
+% else:
+            const ${type} t_spike = dtu + tau * log1p(
+                -(V - V_threshold) / (J - V_threshold));
+            W = ref + t_spike;
+% endif
             V = 0;
             spiked = 1;
         }
+% if not fastlif:
+         else if (V < 0) {
+            V = 0;
+        }
+% endif
+
 % endfor
         outV = V;
         outW = W;
@@ -997,8 +1004,8 @@ def _plan_template(queue, name, core_text, declares="", tag=None,
         assert vname not in avars, "Name clash"
         assert len(v) == len(input0)
         assert (v.shape0s == input0.shape0s).all()
-        assert (v.stride0s == v.shape1s).all()  # rows contiguous
-        assert (v.stride1s == 1).all()  # columns contiguous
+        assert (v.stride0s == 1).all() # rows contiguous
+        assert (v.stride1s == v.shape0s).all() # columns contiguous
         assert (v.shape1s == 1).all()  # vectors only
 
         offset = '%(name)s_starts[gind1]' % {'name': vname}
@@ -1231,7 +1238,7 @@ def plan_whitenoise(queue, Y, dist_enums, dist_params, scale, inc, dt, rngs,
     for arr in [Y, dist_enums, dist_params]:
         assert (arr.shape1s == 1).all()
         assert (arr.stride0s == 1).all()
-        assert (arr.stride1s == 1).all()
+        assert (arr.stride1s == arr.shape0s).all()
 
     assert (dist_enums.shape0s == 1).all()
 
diff --git a/nengo_ocl/clraggedarray.py b/nengo_ocl/clraggedarray.py
index 265be04..d7ab459 100644
--- a/nengo_ocl/clraggedarray.py
+++ b/nengo_ocl/clraggedarray.py
@@ -73,6 +73,7 @@ def __init__(self, queue, np_raggedarray):
         self.stride1s = np_raggedarray.stride1s
         self.buf = np_raggedarray.buf
         self.names = np_raggedarray.names
+        self.order = np_raggedarray.order
 
     @classmethod
     def from_arrays(cls, queue, arrays, names=None, dtype=None, align=False):
@@ -248,9 +249,11 @@ def __setitem__(self, item, new_value):
                 # contiguous
                 clarray = self.getitem_device(item)
                 if isinstance(new_value, np.ndarray):
-                    array = np.asarray(new_value, order='C', dtype=self.dtype)
+                    array = np.asarray(new_value, dtype=self.dtype,
+                                       order=self.order)
                 else:
-                    array = np.zeros(clarray.shape, dtype=clarray.dtype)
+                    array = np.zeros(clarray.shape, dtype=clarray.dtype,
+                                     order=self.order)
                     array[...] = new_value
 
                 array.shape = clarray.shape  # reshape to avoid warning
diff --git a/nengo_ocl/raggedarray.py b/nengo_ocl/raggedarray.py
index e06e1aa..161fa87 100644
--- a/nengo_ocl/raggedarray.py
+++ b/nengo_ocl/raggedarray.py
@@ -33,8 +33,10 @@ class RaggedArray(object):
     in the same underlying buffer.
     """
 
-    def __init__(self, arrays, names=None, dtype=None, align=False):
-        arrays = [np.asarray(a) for a in arrays]
+    def __init__(self, arrays, names=None, dtype=None, align=False, order='F'):
+        assert order in ('C','F')
+
+        arrays = [np.asarray(a, order=order) for a in arrays]
         assert len(arrays) > 0
         assert all(a.ndim <= 2 for a in arrays)
 
@@ -55,16 +57,22 @@ def __init__(self, arrays, names=None, dtype=None, align=False):
         self.starts = starts
         self.shape0s = [a.shape[0] if a.ndim > 0 else 1 for a in arrays]
         self.shape1s = [a.shape[1] if a.ndim > 1 else 1 for a in arrays]
-        self.stride0s = [a.shape[1] if a.ndim == 2 else 1 for a in arrays]
-        self.stride1s = [1 for a in arrays]
+        self.stride0s = [a.strides[0] // a.itemsize
+                         if a.ndim > 0 else 1 for a in arrays]
+        self.stride1s = [a.strides[1] // a.itemsize
+                         if a.ndim > 1 else
+                         (max(a.shape[0], 1) if a.ndim > 0 and order == 'F' else 1)
+                         for a in arrays]
 
         buf = np.zeros(starts[-1] + arrays[-1].size, dtype=dtype)
         for a, s in zip(arrays, starts):
-            buf[s:s+a.size] = a.ravel()
+            buf[s:s+a.size] = a.ravel(order=order)
 
         self.buf = buf
         self.names = names
 
+        self.order = order
+
         self._sizes = None
 
     @classmethod
diff --git a/nengo_ocl/simulator.py b/nengo_ocl/simulator.py
index dd83935..49a99f6 100644
--- a/nengo_ocl/simulator.py
+++ b/nengo_ocl/simulator.py
@@ -24,7 +24,7 @@
 
 from nengo_ocl.raggedarray import RaggedArray
 from nengo_ocl.clraggedarray import CLRaggedArray, to_device
-from nengo_ocl.clra_gemv import plan_block_gemv
+from nengo_ocl.clra_gemv import plan_pretuned_gemv, plan_one_thread_per_row_gemv
 from nengo_ocl.clra_nonlinearities import (
     plan_timeupdate, plan_reset, plan_copy, plan_slicedcopy,
     plan_direct, plan_lif, plan_lif_rate,
@@ -209,10 +209,11 @@ def get_views(self):
 
 class ViewBuilder(object):
 
-    def __init__(self, bases, rarray):
+    def __init__(self, bases, rarray, order='F'):
         self.sidx = {bb: ii for ii, bb in enumerate(bases)}
         assert len(bases) == len(self.sidx)
         self.rarray = rarray
+        self.order = order
 
         self.starts = []
         self.shape0s = []
@@ -238,8 +239,12 @@ def append_view(self, obj):
         self.starts.append(self.rarray.starts[idx] + obj.elemoffset)
         self.shape0s.append(obj.shape[0] if obj.ndim > 0 else 1)
         self.shape1s.append(obj.shape[1] if obj.ndim > 1 else 1)
-        self.stride0s.append(obj.elemstrides[0] if obj.ndim > 0 else 1)
-        self.stride1s.append(obj.elemstrides[1] if obj.ndim > 1 else 1)
+        if self.order == 'F':
+            self.stride0s.append(1)
+            self.stride1s.append(obj.shape[0] if obj.ndim > 0 else 1)
+        else:
+            self.stride0s.append(obj.elemstrides[0] if obj.ndim > 0 else 1)
+            self.stride1s.append(obj.elemstrides[1] if obj.ndim > 1 else 1)
         self.names.append(getattr(obj, 'name', ''))
         self.sidx[obj] = len(self.sidx)
 
@@ -466,7 +471,7 @@ def _sig_gemv(self, ops, A_js_fn, X_js_fn, Y_fn, Y_in_fn=None,
         if callable(beta):
             beta = RaggedArray([sidx[beta(o)] for o in ops], dtype=np.float32)
 
-        rval = plan_block_gemv(
+        rval = plan_one_thread_per_row_gemv(
             self.queue, alpha, all_data, A_js, all_data, X_js, beta, Y,
             Y_in=Y_in, gamma=gamma, tag=tag)
         return rval.plans
@@ -850,7 +855,7 @@ def plan_probes(self):
             X = self.all_data[
                 [self.sidx[self.model.sig[p]['in']] for p in probes]]
             Y = self.RaggedArray(
-                [np.zeros((n_prealloc, self.model.sig[p]['in'].size))
+                [np.zeros((self.model.sig[p]['in'].size, n_prealloc), order='F')
                  for p in probes], dtype=np.float32)
 
             cl_plan = plan_probes(self.queue, periods, X, Y)
@@ -875,8 +880,10 @@ def _probe(self):
                 # XXX: this syntax retrieves *ALL* of Y from the device
                 #      because the :n_buffered only works on the ndarray
                 #      *after* it has been transferred.
-                raw = plan.Y[i][:n_buffered]
-                shaped = raw.reshape((n_buffered,) + shape)
+                raw = plan.Y[i][:,:n_buffered]
+                shaped = raw.reshape(shape + (n_buffered,), order='F')
+                # Transpose to (n_buffered, shape).
+                shaped = shaped.transpose([len(shape)] + list(range(len(shape))))
                 self._probe_outputs[probe].extend(shaped)
         plan.cl_bufpositions.fill(0)
         self.queue.finish()
@@ -908,7 +915,7 @@ def run_steps(self, N, progress_bar=True):
                 N -= B
                 progress.step(n=B)
 
-        if self.profiling > 1:
+        if self.profiling:
             self.print_profiling()
 
     def reset(self, seed=None):
@@ -1043,11 +1050,11 @@ def print_profiling(self, sort=None):
         print('%8s|%10s|%10s|%10s|' % ('n_calls', 'runtime', 'GF/s', 'GB/s'))
 
         for r in table:
-            print('%8d|%10.3f|%10.3f|%10.3f| %s' % r)
+            print('%8d|%10.5f|%10.3f|%10.3f| %s' % r)
 
         # totals totals
         print('-' * 80)
-        col = lambda c: np.asarray(map(lambda x: x[c], table))
+        col = lambda c: np.asarray([x[c] for x in table])
         times = col(1)
 
         def wmean(x):
diff --git a/nengo_ocl/tests/test_clra_gemv.py b/nengo_ocl/tests/test_clra_gemv.py
index e8c2ab2..20eb533 100644
--- a/nengo_ocl/tests/test_clra_gemv.py
+++ b/nengo_ocl/tests/test_clra_gemv.py
@@ -11,7 +11,7 @@
 from nengo_ocl.clraggedarray import CLRaggedArray as CLRA
 from nengo_ocl.clra_gemv import (
     plan_reduce_gemv, plan_many_dots_gemv, plan_block_gemv,
-    plan_ragged_gather_gemv)
+    plan_ragged_gather_gemv, plan_one_thread_per_row_gemv)
 
 
 ctx = cl.create_some_context()
@@ -24,7 +24,7 @@ def pytest_generate_tests(metafunc):
         metafunc.parametrize(
             "planner", [
                 plan_reduce_gemv, plan_many_dots_gemv, plan_block_gemv,
-                plan_ragged_gather_gemv])
+                plan_ragged_gather_gemv, plan_one_thread_per_row_gemv])
 
 
 def allclose(raA, raB):
@@ -285,20 +285,35 @@ def test_speed(rng):
     A_js = RA(ajs, dtype=np.int32)
     X_js = RA(xjs, dtype=np.int32)
 
-    # -- run cl computation
-    prog = plan_ragged_gather_gemv(
-        queue, alpha, clA, A_js, clX, X_js, beta, clY)
-    plans = prog.choose_plans()
+    # For bandwidth and load computations
+    total_mem_gb = (sum(m*n + m + n + m for m,n in zip(ms,ns)) + 3) * np.dtype('float32').itemsize * 1e-9
+    total_gflops = sum(m* (2*n-1) + 4*m for m,n in zip(ms,ns)) * 1e-9
 
-    print('')
-    print('-' * 5 + ' Plans ' + '-' * 45)
-    for plan in plans:
-        print(plan)
+    # -- run cl computation a few times
+    ntrials = 5
+    total_time = 0.0
+    for i in range(ntrials):
+        prog = plan_ragged_gather_gemv(
+            queue, alpha, clA, A_js, clX, X_js, beta, clY)
+        plans = prog.choose_plans()
 
-    with Timer() as timer:
-        for plan in plans:
-            plan()
-    print("nengo_ocl: %0.3f" % timer.duration)
+        if i == 0:
+            print('')
+            print('-' * 5 + ' Plans ' + '-' * 45)
+            for plan in plans:
+                print(plan)
+
+        with Timer() as timer:
+            for plan in plans:
+                plan()
+
+        total_time += timer.duration
+
+    t = total_time / ntrials
+    print("nengo_ocl: %0.3f s, bandwidth: %.1f GB/s, load: %.1f GFlops"
+              % (t,
+                 total_mem_gb / t,
+                 total_gflops / t))
 
     # -- speed test in ocl blas
     if pyopencl_blas:
@@ -318,23 +333,31 @@ def array(a):
         #           for _ in range(k)]
 
         queue.finish()
-        with Timer() as timer:
-            if 0:
-                # use a single queue
-                for A, X, Y in zip(clAs, clXs, clYs):
-                    pyopencl_blas.gemv(queue, A, X, Y)
-                queue.finish()
-            else:
-                # use multiple parallel queues
-                events = []
-                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
-                    q = queues[i % len(queues)]
-                    e = pyopencl_blas.gemv(q, A, X, Y)
-                    events.append(e)
-                for q in queues:
-                    q.flush()
-                cl.wait_for_events(events)
-        print("clBLAS: %0.3f" % timer.duration)
+
+        total_time = 0.0
+        for i in range(ntrials):
+            with Timer() as timer:
+                if 0:
+                    # use a single queue
+                    for A, X, Y in zip(clAs, clXs, clYs):
+                        pyopencl_blas.gemv(queue, A, X, Y)
+                        queue.finish()
+                else:
+                    # use multiple parallel queues
+                    events = []
+                    for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
+                        q = queues[i % len(queues)]
+                        e = pyopencl_blas.gemv(q, A, X, Y)
+                        events.append(e)
+                    for q in queues:
+                        q.flush()
+                    cl.wait_for_events(events)
+            total_time += timer.duration
+        t = total_time / ntrials
+        print("clBLAS: %0.3f, bandwidth: %.1f GB/s, load: %.1f GFlops"
+              % (t,
+                 total_mem_gb / t,
+                 total_gflops / t))
 
 
 if __name__ == '__main__':