@@ -4,7 +4,7 @@ use anyhow::bail;
 use faer::linalg::triangular_solve::solve_lower_triangular_in_place;
 use faer::linalg::triangular_solve::solve_upper_triangular_in_place;
 use faer::{Col, Mat, Row};
-use rayon::prelude::*;
+
 /// Applies Burke's Interior Point Method (IPM) to solve a convex optimization problem.
 ///
 /// The objective function to maximize is:
@@ -93,14 +93,14 @@ pub fn burke(psi: &Psi) -> anyhow::Result<(Weights, f64)> {
 
     let mut psi_inner: Mat<f64> = Mat::zeros(psi.nrows(), psi.ncols());
 
-    let n_threads = faer::get_global_parallelism().degree();
-
     let rows = psi.nrows();
 
-    let mut output: Vec<Mat<f64>> = (0..n_threads).map(|_| Mat::zeros(rows, rows)).collect();
-
     let mut h: Mat<f64> = Mat::zeros(rows, rows);
 
+    // Cache-size threshold: prefer sequential for small matrices to avoid thread overhead
+    // For larger matrices, use faer's built-in parallelism which has better cache behavior
+    const PARALLEL_THRESHOLD: usize = 512;
+
     while mu > eps || norm_r > eps || gap > eps {
         let smu = sig * mu;
         // inner = lam ./ y, elementwise.
@@ -109,46 +109,32 @@ pub fn burke(psi: &Psi) -> anyhow::Result<(Weights, f64)> {
         let w_plam = Col::from_fn(plam.nrows(), |i| plam.get(i) / w.get(i));
 
         // Scale each column of psi by the corresponding element of 'inner'
-
-        if psi.ncols() > n_threads * 128 {
-            psi_inner
-                .par_col_partition_mut(n_threads)
-                .zip(psi.par_col_partition(n_threads))
-                .zip(inner.par_partition(n_threads))
-                .zip(output.par_iter_mut())
-                .for_each(|(((mut psi_inner, psi), inner), output)| {
-                    psi_inner
-                        .as_mut()
-                        .col_iter_mut()
-                        .zip(psi.col_iter())
-                        .zip(inner.iter())
-                        .for_each(|((col, psi_col), inner_val)| {
-                            col.iter_mut().zip(psi_col.iter()).for_each(|(x, psi_val)| {
-                                *x = psi_val * inner_val;
-                            });
-                        });
-                    faer::linalg::matmul::triangular::matmul(
-                        output.as_mut(),
-                        faer::linalg::matmul::triangular::BlockStructure::TriangularLower,
-                        faer::Accum::Replace,
-                        &psi_inner,
-                        faer::linalg::matmul::triangular::BlockStructure::Rectangular,
-                        psi.transpose(),
-                        faer::linalg::matmul::triangular::BlockStructure::Rectangular,
-                        1.0,
-                        faer::Par::Seq,
-                    );
+        // Use sequential column scaling - cache-friendly access pattern
+        psi_inner
+            .as_mut()
+            .col_iter_mut()
+            .zip(psi.col_iter())
+            .zip(inner.iter())
+            .for_each(|((col, psi_col), inner_val)| {
+                col.iter_mut().zip(psi_col.iter()).for_each(|(x, psi_val)| {
+                    *x = psi_val * inner_val;
                 });
+            });
 
-            let mut first_iter = true;
-            for output in &output {
-                if first_iter {
-                    h.copy_from(output);
-                    first_iter = false;
-                } else {
-                    h += output;
-                }
-            }
+        // Use faer's built-in parallelism for matmul - it has better cache tiling
+        // than our manual partitioning which caused false sharing
+        if psi.ncols() > PARALLEL_THRESHOLD {
+            faer::linalg::matmul::triangular::matmul(
+                h.as_mut(),
+                faer::linalg::matmul::triangular::BlockStructure::TriangularLower,
+                faer::Accum::Replace,
+                &psi_inner,
+                faer::linalg::matmul::triangular::BlockStructure::Rectangular,
+                psi.transpose(),
+                faer::linalg::matmul::triangular::BlockStructure::Rectangular,
+                1.0,
+                faer::Par::rayon(0), // Let faer handle parallelism with proper cache tiling
+            );
         } else {
             psi_inner
                 .as_mut()
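
For reference, a minimal standalone sketch of the pattern the new code follows: scale each column of a matrix by a vector, then form only the lower triangle of the scaled matrix times the matrix's transpose, switching between faer::Par::Seq and faer::Par::rayon(0) at a size threshold. The helper name scaled_gram_lower and its threshold parameter are illustrative, not part of the commit; the faer calls mirror the ones shown in the diff above.

use faer::linalg::matmul::triangular::{matmul, BlockStructure};
use faer::{Col, Mat};

// Illustrative helper (not from the commit): computes the lower triangle of
// (a * diag(d)) * a^T, mirroring the scaling + triangular matmul in the diff.
fn scaled_gram_lower(a: &Mat<f64>, d: &Col<f64>, parallel_threshold: usize) -> Mat<f64> {
    let rows = a.nrows();
    let mut scaled: Mat<f64> = Mat::zeros(a.nrows(), a.ncols());
    let mut h: Mat<f64> = Mat::zeros(rows, rows);

    // Sequential column scaling: faer stores matrices column-major, so walking
    // one column at a time keeps the writes contiguous and cache-friendly.
    scaled
        .as_mut()
        .col_iter_mut()
        .zip(a.col_iter())
        .zip(d.iter())
        .for_each(|((col, a_col), d_val)| {
            col.iter_mut().zip(a_col.iter()).for_each(|(x, a_val)| {
                *x = a_val * d_val;
            });
        });

    // Hand the matmul's parallelism to faer above the threshold; only the
    // lower triangle is computed because the product is symmetric.
    let par = if a.ncols() > parallel_threshold {
        faer::Par::rayon(0) // 0 lets faer pick the rayon thread count
    } else {
        faer::Par::Seq
    };
    matmul(
        h.as_mut(),
        BlockStructure::TriangularLower,
        faer::Accum::Replace,
        &scaled,
        BlockStructure::Rectangular,
        a.transpose(),
        BlockStructure::Rectangular,
        1.0,
        par,
    );
    h
}

With parallel_threshold set to the commit's PARALLEL_THRESHOLD of 512, small problems stay on one thread and avoid thread overhead, while larger ones let faer handle the tiling instead of the hand-rolled column partitioning that the commit removes.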