gto fixes.

Robin Skånberg · Robin Skånberg · commit 79670d4da538 · 2025-11-25T14:28:59.000+01:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -101,7 +101,8 @@ set(GTO_SHADER_FILES
     src/shaders/eval_alie.comp
     src/shaders/eval_gto.comp
     src/shaders/eval_gto_density.comp
-    src/shaders/segment_and_attribute_to_group.comp
+    src/shaders/eval_gto_density_grad.comp
+    src/shaders/voronoi_segment.comp
 )
 
 set(MD_DEFINES MD_GL_SPLINE_SUBDIVISION_COUNT=${MD_GL_SPLINE_SUBDIVISION_COUNT})
diff --git a/src/md_gto.c b/src/md_gto.c
@@ -93,11 +93,11 @@ static GLuint get_alie_program(void) {
     return program;
 }
 
-static GLuint get_vol_segment_to_groups_program(void) {
+static GLuint get_voronoi_segment_program(void) {
     static GLuint program = 0;
     if (!program) {
         GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
-        if (md_gl_shader_compile(shader, (str_t){(const char*)segment_and_attribute_to_group_comp, segment_and_attribute_to_group_comp_size}, 0, 0)) {
+        if (md_gl_shader_compile(shader, (str_t){(const char*)voronoi_segment_comp, voronoi_segment_comp_size}, 0, 0)) {
             GLuint prog = glCreateProgram();
             if (md_gl_program_attach_and_link(prog, &shader, 1)) {
                 program = prog;
@@ -247,10 +247,10 @@ void md_gto_grid_evaluate_ALIE_GPU(uint32_t vol_tex, const md_grid_t* vol_grid,
     gto_grid_evaluate_orb_GPU(vol_tex, vol_grid, orb, mode, program);
 }
 
-void md_gto_segment_and_attribute_to_groups_GPU(float* out_group_values, size_t cap_groups, uint32_t vol_tex, const md_grid_t* grid, const float* point_xyzr, const uint32_t* point_group_idx, size_t num_points) {
-    ASSERT(out_group_values);
+void md_gto_voronoi_segment_GPU(float* out_values, const float* point_xyzr, size_t num_points, uint32_t vol_tex, const md_grid_t* grid) {
+    ASSERT(out_values);
     ASSERT(point_xyzr);
-    ASSERT(point_group_idx);
+    ASSERT(grid);
 
     GLenum format = 0;
     if (glGetTextureLevelParameteriv) {
@@ -273,37 +273,30 @@ void md_gto_segment_and_attribute_to_groups_GPU(float* out_group_values, size_t
 
     md_gl_debug_push("SEGMENT VOL TO GROUP");
 
-    GLintptr   ssbo_group_value_offset = 0;
-    GLsizeiptr ssbo_group_value_size   = sizeof(float) * 16;
+    GLintptr   ssbo_point_value_offset = 0;
+    GLsizeiptr ssbo_point_value_size   = sizeof(float) * 16;
 
-    GLintptr   ssbo_point_xyzr_offset  = ALIGN_TO(ssbo_group_value_offset + ssbo_group_value_size, 256);
+    GLintptr   ssbo_point_xyzr_offset  = ALIGN_TO(ssbo_point_value_offset + ssbo_point_value_size, 256);
     GLsizeiptr ssbo_point_xyzr_size    = sizeof(float) * 4 * num_points;
 
-    GLintptr   ssbo_point_group_offset = ALIGN_TO(ssbo_point_xyzr_offset + ssbo_point_xyzr_size, 256);
-    GLsizeiptr ssbo_point_group_size   = sizeof(uint32_t) * num_points;
-
-    size_t total_size = ALIGN_TO(ssbo_point_group_offset + ssbo_point_group_size, 256);
-
+    size_t total_size = ALIGN_TO(ssbo_point_xyzr_offset + ssbo_point_xyzr_size, 256);
     GLuint ssbo = get_buffer(total_size);
     
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
 
-    // Clear first 16 bytes which represents the result (group_values)
-    glClearBufferSubData(GL_SHADER_STORAGE_BUFFER, GL_R32F, ssbo_group_value_offset, ssbo_group_value_size, GL_RED, GL_FLOAT, NULL);
+    // Clear first portion of buffer holding point values
+    glClearBufferSubData(GL_SHADER_STORAGE_BUFFER, GL_R32F, ssbo_point_value_offset, ssbo_point_value_size, GL_RED, GL_FLOAT, NULL);
     // Fill next portion of buffer with point xyzr
     glBufferSubData(GL_SHADER_STORAGE_BUFFER, ssbo_point_xyzr_offset,  ssbo_point_xyzr_size,  point_xyzr);
-    // Fill last portion of buffer with point indices
-    glBufferSubData(GL_SHADER_STORAGE_BUFFER, ssbo_point_group_offset, ssbo_point_group_size, point_group_idx);
 
     glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ssbo, ssbo_point_xyzr_offset,  ssbo_point_xyzr_size);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, ssbo, ssbo_point_group_offset, ssbo_point_group_size);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, ssbo, ssbo_group_value_offset, ssbo_group_value_size);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, ssbo, ssbo_point_value_offset, ssbo_point_value_size);
 
     glBindImageTexture(0, vol_tex, 0, GL_TRUE, 0, GL_READ_ONLY, format);
 
     glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
 
-    GLuint program = get_vol_segment_to_groups_program();
+    GLuint program = get_voronoi_segment_program();
     glUseProgram(program);
 
     float world_to_model[4][4];
@@ -321,18 +314,16 @@ void md_gto_segment_and_attribute_to_groups_GPU(float* out_group_values, size_t
         DIV_UP(grid->dim[1], 8),
         DIV_UP(grid->dim[2], 8),
     };
-
     glDispatchCompute(num_groups[0], num_groups[1], num_groups[2]);
 
     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
-    uint32_t temp_group_values[16];
-    glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, ssbo_group_value_offset, ssbo_group_value_size, temp_group_values);
-
-    for (size_t i = 0; i < MIN(cap_groups, 16); ++i) {
-        double value = temp_group_values[i] / QUANTIZATION_SCALE_FACTOR;
-        out_group_values[i] = (float)value;
+    uint32_t* temp_values = (uint32_t*)md_temp_push(sizeof(uint32_t) * num_points);
+    glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, ssbo_point_value_offset, ssbo_point_value_size, temp_values);
+    for (size_t i = 0; i < num_points; ++i) {
+        out_values[i] = (float)(temp_values[i] / QUANTIZATION_SCALE_FACTOR);
     }
+    md_temp_pop(sizeof(uint32_t) * num_points);
 
     glUseProgram(0);
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
@@ -485,7 +476,7 @@ void md_gto_grid_evaluate_matrix_GPU(uint32_t vol_tex, const md_grid_t* grid, co
     GLuint64 elapsedTime = 0;
     glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsedTime); // nanoseconds
 
-	MD_LOG_DEBUG("GTO Density evaluation GPU time: %.3f ms", elapsedTime / 1e6);
+	MD_LOG_DEBUG("GTO Density evaluation of [%i,%i,%i] GPU time: %.3f ms", grid->dim[0], grid->dim[1], grid->dim[2], elapsedTime / 1e6);
 
     glUseProgram(0);
 
diff --git a/src/md_gto.h b/src/md_gto.h
@@ -91,15 +91,13 @@ void md_gto_grid_evaluate_orb_GPU(uint32_t vol_tex, const md_grid_t* vol_grid, c
 void md_gto_grid_evaluate_ALIE_GPU(uint32_t vol_tex, const md_grid_t* vol_grid, const md_orbital_data_t* orb, md_gto_eval_mode_t mode);
 
 // This is malplaced at the moment, but this is for the moment, the best match in where to place the functionality
-// Performs voronoi segmentation of the supplied volume to points with a supplied radius and accumulates the value of each voxel into the corresponding group of the closest point
-// - out_group_values: Destination array holding the group values that are written to
-// - cap_groups: Capacity of group array
-// - vol_tex: The texture handle to the volume
-// - vol_grid: The grid defining the volume
+// Performs Voronoi segmentation of the supplied volume to points with a supplied radius and accumulates the value of each voxel into the corresponding point
+// - out_values: Destination array holding the point values that are written to, should have length 'num_points'
 // - point_xyzr: Point coordinates + radius, packed xyzrxyzrxyzr
-// - point_group_idx: Point group index [0, num_groups-1]
 // - num_points: Number of points
-void md_gto_segment_and_attribute_to_groups_GPU(float* out_group_values, size_t cap_groups, uint32_t vol_tex, const md_grid_t* vol_grid, const float* point_xyzr, const uint32_t* point_group_idx, size_t num_points);
+// - vol_tex: The texture handle to the volume
+// - vol_grid: The grid defining the volume
+void md_gto_voronoi_segment_GPU(float* out_values, const float* point_xyzr, size_t num_points, uint32_t vol_tex, const md_grid_t* vol_grid);
 
 // Evaluates GTOs over a grid
 // - out_grid_values: The grid to write the evaluated values to, should have length 'grid->dim[0] * grid->dim[1] * grid->dim[2]'
diff --git a/src/shaders/eval_gto_density.comp b/src/shaders/eval_gto_density.comp
@@ -1,9 +1,5 @@
 #version 430 core
 
-#extension GL_KHR_shader_subgroup_basic  : enable
-#extension GL_KHR_shader_subgroup_ballot : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-
 struct PGTO {
     float coeff;
     float alpha;
@@ -48,7 +44,7 @@ layout(std140, binding = 0) uniform UniformBlock {
 
 layout(binding = 0) writeonly restrict uniform image3D out_vol;
 
-#if 0
+#if 1
 float safe_pow(float base, uint exponent) {
     switch(exponent) {
         case 0u: return 1.0;
@@ -60,7 +56,11 @@ float safe_pow(float base, uint exponent) {
             return b2 * b2;
         }
         default: {
-            return 1.0;
+            float v = 1.0;
+            for (uint e = 0u; e < exponent; ++e) {
+                v *= base;
+            }
+            return v;
         }
     }
 }
@@ -96,7 +96,11 @@ uint pack_offset_length(uint offset, uint length) {
     return (length << 24u) | offset;
 }
 
-#define WG_SIZE 512
+#define WG_X 8
+#define WG_Y 8
+#define WG_Z 8
+
+#define WG_SIZE (WG_X * WG_Y * WG_Z)
 #define TILE_SIZE 32
 #define TILE_AREA (TILE_SIZE*TILE_SIZE)
 #define MAX_SCREENED_CGTOS 2048
@@ -119,26 +123,28 @@ uint get_index(uint i, uint j, uint N) {
     return row_offset + (col - row);
 }
 
-// Populate the D GSM from the global D_matrix
+// Fill D_tile from global D_matrix
 void fill_D_tile(uint tile_i, uint tile_j) {
-    uint tid = gl_LocalInvocationIndex;
-    if (tid >= TILE_SIZE) return;
+    const uint tid  = gl_LocalInvocationIndex;
 
     uint baseI = tile_i * TILE_SIZE;
     uint baseJ = tile_j * TILE_SIZE;
 
-    uint idxI = baseI + tid;
-    uint gi = (idxI < MAX_SCREENED_CGTOS) ? screened_cgtos[idxI] : INVALID_CGTO_IDX;
+    for (uint index = tid; index < TILE_AREA; index += WG_SIZE) {
+        uint row = index / TILE_SIZE;
+        uint col = index % TILE_SIZE;
+
+        uint idxI = baseI + row;
+        uint gi = (idxI < MAX_SCREENED_CGTOS) ? screened_cgtos[idxI] : INVALID_CGTO_IDX;
 
-    for (uint col = 0; col < TILE_SIZE; ++col) {
         uint idxJ = baseJ + col;
         uint gj = (idxJ < MAX_SCREENED_CGTOS) ? screened_cgtos[idxJ] : INVALID_CGTO_IDX;
 
         float value = 0.0;
         if (gi != INVALID_CGTO_IDX && gj != INVALID_CGTO_IDX) {
             value = D_matrix[get_index(gi, gj, D_matrix_dim)];
         }
-        D_tile[tid][col] = value;
+        D_tile[row][col] = value;
     }
 }
 
@@ -162,29 +168,27 @@ void fill_cgtos_tile(uint tile_number, vec3 model_aabb_min, vec3 model_aabb_max)
         uint src_end = cgto_offset[global_cgto_idx + 1u];
         uint src_len = src_end - src_beg;
 
-        if (src_beg != src_end) {
-            // Reserve contiguous space for this CGTO's PGTOs
-            uint dst = atomicAdd(num_pgtos, src_len);
-
-            cgto_center = cgto_xyzr[global_cgto_idx].xyz;
-            cgto_pgto_off = dst;
-            
-            vec3 model_xyz = vec3(world_to_model * vec4(cgto_center, 1.0));
-            vec3   d = clamp(model_xyz, model_aabb_min, model_aabb_max) - model_xyz;
-            float d2 = dot(d, d);
+        // Reserve contiguous space for this CGTO's PGTOs
+        uint dst = atomicAdd(num_pgtos, src_len);
 
-            // Copy PGTOs
-            for (uint k = 0; k < src_len; ++k) {
-                float r = pgto_radius[src_beg + k];
-                // Cull based on radius
-                if (d2 < r * r) {
-                    PGTO g;
-                    g.coeff = pgto_coeff[src_beg + k];
-                    g.alpha = pgto_alpha[src_beg + k];
-                    g.ijkl  = pgto_ijkl[src_beg + k];
-                    pgtos_tile[cgto_pgto_off + cgto_pgto_len] = g;
-                    cgto_pgto_len++;
-                }
+        cgto_center = cgto_xyzr[global_cgto_idx].xyz;
+        cgto_pgto_off = dst;
+        
+        vec3 model_xyz = vec3(world_to_model * vec4(cgto_center, 1.0));
+        vec3   d = clamp(model_xyz, model_aabb_min, model_aabb_max) - model_xyz;
+        float d2 = dot(d, d);
+
+        // Copy PGTOs
+        for (uint k = 0; k < src_len; ++k) {
+            float r = pgto_radius[src_beg + k];
+            // Cull based on radius
+            if (d2 < r * r) {
+                PGTO g;
+                g.coeff = pgto_coeff[src_beg + k];
+                g.alpha = pgto_alpha[src_beg + k];
+                g.ijkl  = pgto_ijkl[src_beg + k];
+                pgtos_tile[cgto_pgto_off + cgto_pgto_len] = g;
+                cgto_pgto_len++;
             }
         }
     }
@@ -205,16 +209,33 @@ void eval_phis(out float out_phi[TILE_SIZE], vec3 coord) {
         uint pgto_len = cgtos_tile[i].pgto_len;
 
         vec3 center = cgtos_tile[i].coord;
-        vec3 d  = coord - center;
+        vec3 d   = coord - center;
         float r2 = dot(d, d);
-        
+
+        // Precompute powers once per CGTO
+        float dx2 = d.x * d.x;
+        float dy2 = d.y * d.y;
+        float dz2 = d.z * d.z;
+
         float phi = 0.0;
         for (uint j = pgto_off; j < pgto_off + pgto_len; ++j) {
             PGTO pgto = pgtos_tile[j];
             uvec4 ijkl = unpack_ijkl(pgto.ijkl);
-            float fx = safe_pow(d.x, ijkl.x);
-            float fy = safe_pow(d.y, ijkl.y);
-            float fz = safe_pow(d.z, ijkl.z);
+            // Use ternary for common cases (no divergence, compiles to select)
+            float fx = (ijkl.x == 0u) ? 1.0 : 
+                       (ijkl.x == 1u) ? d.x :
+                       (ijkl.x == 2u) ? dx2 :
+                       (ijkl.x == 3u) ? dx2 * d.x : dx2 * dx2;
+            
+            float fy = (ijkl.y == 0u) ? 1.0 : 
+                       (ijkl.y == 1u) ? d.y :
+                       (ijkl.y == 2u) ? dy2 :
+                       (ijkl.y == 3u) ? dy2 * d.y : dy2 * dy2;
+            
+            float fz = (ijkl.z == 0u) ? 1.0 : 
+                       (ijkl.z == 1u) ? d.z :
+                       (ijkl.z == 2u) ? dz2 :
+                       (ijkl.z == 3u) ? dz2 * d.z : dz2 * dz2;
             phi += pgto.coeff * fx * fy * fz * exp(-pgto.alpha * r2);
         }
         out_phi[i] = phi;
@@ -225,9 +246,10 @@ float symmetric_contraction(float phi[TILE_SIZE], float D[TILE_SIZE][TILE_SIZE])
     float result = 0.0;
     for (uint i = 0; i < TILE_SIZE; ++i) {
         float ai = phi[i];
-        result += D[i][i] * ai * ai;                        // Diagonal
+        result = fma(D[i][i] * ai, ai, result);                     // Diagonal
+        ai = 2.0 * ai;
         for (uint j = i + 1; j < TILE_SIZE; ++j) {
-            result += 2.0 * D[i][j] * ai * phi[j];          // Off-diagonal
+            result = fma(D[i][j] * ai, phi[j], result);             // Off-diagonal
         }
     }
     return result;
@@ -236,15 +258,15 @@ float symmetric_contraction(float phi[TILE_SIZE], float D[TILE_SIZE][TILE_SIZE])
 float contraction(float phi_mu[TILE_SIZE], float phi_nu[TILE_SIZE], float D[TILE_SIZE][TILE_SIZE]) {
     float result = 0.0;
     for (uint i = 0; i < TILE_SIZE; ++i) {
-        float ai = phi_mu[i];
+        float ai = 2.0 * phi_mu[i];
         for (uint j = 0; j < TILE_SIZE; ++j) {
-            result += 2.0 * D[i][j] * ai * phi_nu[j];
+            result = fma(D[i][j] * ai, phi_nu[j], result);          // Off-diagonal
         }
     }
     return result;
 }
 
-layout (local_size_x = 8, local_size_y = 8, local_size_z = 8) in;
+layout (local_size_x = WG_X, local_size_y = WG_Y, local_size_z = WG_Z) in;
 void main() {
     uint tid = gl_LocalInvocationIndex;
     if (tid == 0) {
@@ -256,7 +278,6 @@ void main() {
     vec3 model_aabb_max = vec3((gl_WorkGroupID.xyz + uvec3(1,1,1)) * gl_WorkGroupSize.xyz) * step.xyz;
     // Stage 1: Screening. Prune CGTOs to identify which are relevant for region
     {
-        // Stream matches directly; avoid large per-thread stacks and subgroup prefix sums
         for (uint i = tid; i < D_matrix_dim; i += WG_SIZE) {
             vec4 cgto = cgto_xyzr[i];
             if (cgto.w == 0.0) continue;
@@ -291,6 +312,7 @@ void main() {
                 screened_cgtos[i] = INVALID_CGTO_IDX;
             }
         }
+        barrier();
     }
 
     float phi_tile_mu[TILE_SIZE]; // evaluated φ_μ(r) in registers
@@ -330,8 +352,6 @@ void main() {
         rho += symmetric_contraction(phi_tile_mu, D_tile);
 
         for (uint tile_j = tile_i + 1; tile_j < num_tiles; ++tile_j) {
-            barrier();
-
             // OFF DIAGONAL TILE
             if (tid == 0) {
                 num_pgtos = 0;
diff --git a/src/shaders/eval_gto_density_grad.comp b/src/shaders/eval_gto_density_grad.comp
diff --git a/src/shaders/voronoi_segment.comp b/src/shaders/voronoi_segment.comp

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,8 @@ set(GTO_SHADER_FILES`
`101`	`101`	`src/shaders/eval_alie.comp`
`102`	`102`	`src/shaders/eval_gto.comp`
`103`	`103`	`src/shaders/eval_gto_density.comp`
`104`		`- src/shaders/segment_and_attribute_to_group.comp`
	`104`	`+ src/shaders/eval_gto_density_grad.comp`
	`105`	`+ src/shaders/voronoi_segment.comp`
`105`	`106`	`)`
`106`	`107`
`107`	`108`	`set(MD_DEFINES MD_GL_SPLINE_SUBDIVISION_COUNT=${MD_GL_SPLINE_SUBDIVISION_COUNT})`