Added SUMMA tests and fixed dtype problem

astroC86 · astroC86 · commit a994192d5d92 · 2025-07-26T21:08:47.000+02:00
diff --git a/examples/plot_matrixmult.py b/examples/plot_matrixmult.py
@@ -28,6 +28,7 @@
 
 import pylops_mpi
 from pylops_mpi import Partition
+from pylops_mpi.basicoperators.MatrixMult import active_grid_comm, MPIMatrixMult
 
 plt.close("all")
 
@@ -88,8 +89,7 @@
 # than the row or columm ranks.
 
 base_comm = MPI.COMM_WORLD
-comm, rank, row_id, col_id, is_active = \
-    pylops_mpi.MPIMatrixMult.active_grid_comm(base_comm, N, M)
+comm, rank, row_id, col_id, is_active = active_grid_comm(base_comm, N, M)
 print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")
 if not is_active: exit(0)
 
@@ -147,7 +147,7 @@
 ################################################################################
 # We are now ready to create the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
 # operator and the input matrix :math:`\mathbf{X}`
-Aop = pylops_mpi.MPIMatrixMult(A_p, M, base_comm=comm, dtype="float32")
+Aop = MPIMatrixMult(A_p, M, base_comm=comm, dtype="float32", kind="block")
 
 col_lens = comm.allgather(my_own_cols)
 total_cols = np.sum(col_lens)
diff --git a/examples/plot_summamatrixmult.py b/examples/plot_summamatrixmult.py
@@ -1,11 +1,28 @@
+r"""
+Distributed SUMMA Matrix Multiplication
+=======================================
+This example shows how to use the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
+operator with ``kind="summa"`` to multiply a matrix :math:`\mathbf{A}`, distributed
+in 2D blocks across a square process grid, by a matrix :math:`\mathbf{X}` distributed
+in 2D blocks across the same grid; the result :math:`\mathbf{Y}` is distributed in the
+same way. Similarly, the adjoint operation can be applied to a matrix :math:`\mathbf{Y}`
+distributed in the same fashion as matrix :math:`\mathbf{X}`.
+
+Note that whilst the different blocks of matrix :math:`\mathbf{A}` are directly
+stored in the operator on different ranks, the matrices :math:`\mathbf{X}` and
+:math:`\mathbf{Y}` are effectively represented by 1-D :py:class:`pylops_mpi.DistributedArray`
+objects where the different blocks are flattened and stored on different ranks.
+Moreover, to optimize communications, the ranks are organized in a square grid and
+blocks of :math:`\mathbf{A}` and :math:`\mathbf{X}` are systematically broadcast
+across different ranks during computation (see below for details).
+"""
+
 import math
 import numpy as np
 from mpi4py import MPI
 
 import pylops_mpi
-from pylops_mpi.basicoperators.MatrixMult import (local_block_spit,
-                                                     block_gather,
-                                                     MPISummaMatrixMult)
+from pylops_mpi.basicoperators.MatrixMult import (local_block_spit, block_gather, MPIMatrixMult)
 
 comm = MPI.COMM_WORLD
 rank = comm.Get_rank()
@@ -16,43 +33,40 @@
 K = 9
 
 A_shape = (N, K)
-B_shape = (K, M)
-C_shape = (N, M)
+x_shape = (K, M)
+y_shape = (N, M)
 
 p_prime = math.isqrt(size)
-assert p_prime * p_prime == size, "Number of processes must be a perfect square"
-
 A_data = np.arange(int(A_shape[0] * A_shape[1])).reshape(A_shape)
-B_data = np.arange(int(B_shape[0] * B_shape[1])).reshape(B_shape)
+x_data = np.arange(int(x_shape[0] * x_shape[1])).reshape(x_shape)
 
 A_slice = local_block_spit(A_shape, rank, comm)
-B_slice = local_block_spit(B_shape, rank, comm)
+x_slice = local_block_spit(x_shape, rank, comm)
 A_local = A_data[A_slice]
-B_local = B_data[B_slice]
-# A_local, (N_new, K_new) = block_distribute(A_data,rank, comm)
-# B_local, (K_new, M_new) = block_distribute(B_data,rank, comm)
+x_local = x_data[x_slice]
 
-B_dist = pylops_mpi.DistributedArray(global_shape=(K * M),
-                                     local_shapes=comm.allgather(B_local.shape[0] * B_local.shape[1]),
+x_dist = pylops_mpi.DistributedArray(global_shape=(K * M),
+                                     local_shapes=comm.allgather(x_local.shape[0] * x_local.shape[1]),
                                      base_comm=comm,
-                                     partition=pylops_mpi.Partition.SCATTER)
-B_dist.local_array[:] = B_local.flatten()
+                                     partition=pylops_mpi.Partition.SCATTER,
+                                     dtype=x_local.dtype)
+x_dist.local_array[:] = x_local.flatten()
 
-Aop = MPISummaMatrixMult(A_local, M, base_comm=comm)
-C_dist = Aop @ B_dist
-Z_dist = Aop.H @ C_dist
+Aop = MPIMatrixMult(A_local, M, base_comm=comm, kind="summa", dtype=A_local.dtype)
+y_dist = Aop @ x_dist
+xadj_dist = Aop.H @ y_dist
 
-C = block_gather(C_dist, (N,M), (N,M), comm)
-Z = block_gather(Z_dist, (K,M), (K,M), comm)
+y = block_gather(y_dist, (N,M), (N,M), comm)
+xadj = block_gather(xadj_dist, (K,M), (K,M), comm)
 if rank == 0 :
-    C_correct = np.allclose(A_data @ B_data, C)
-    print("C expected: ", C_correct)
-    if not C_correct:
-        print("expected:\n", A_data @ B_data)
-        print("calculated:\n",C)
-
-    Z_correct = np.allclose((A_data.T.dot((A_data @ B_data).conj())).conj(), Z.astype(np.int32))
-    print("Z expected: ", Z_correct)
-    if not Z_correct:
-        print("expected:\n", (A_data.T.dot((A_data @ B_data).conj())).conj())
-        print("calculated:\n", Z.astype(np.int32))
+    y_correct = np.allclose(A_data @ x_data, y)
+    print("y expected: ", y_correct)
+    if not y_correct:
+        print("expected:\n", A_data @ x_data)
+        print("calculated:\n",y)
+
+    xadj_correct = np.allclose((A_data.T.dot((A_data @ x_data).conj())).conj(), xadj.astype(np.int32))
+    print("xadj expected: ", xadj_correct)
+    if not xadj_correct:
+        print("expected:\n", (A_data.T.dot((A_data @ x_data).conj())).conj())
+        print("calculated:\n", xadj.astype(np.int32))
diff --git a/pylops_mpi/basicoperators/MatrixMult.py b/pylops_mpi/basicoperators/MatrixMult.py
@@ -74,7 +74,7 @@ def active_grid_comm(base_comm: MPI.Comm, N: int, M: int):
 def local_block_spit(global_shape: Tuple[int, int],
                      rank: int,
                      comm: MPI.Comm) -> Tuple[slice, slice]:
-    """
+    r"""
     Compute the local sub‐block of a 2D global array for a process in a square process grid.
 
     Parameters
@@ -122,7 +122,7 @@ def local_block_spit(global_shape: Tuple[int, int],
 
 
 def block_gather(x: DistributedArray, new_shape: Tuple[int, int], orig_shape: Tuple[int, int], comm: MPI.Comm):
-    """
+    r"""
     Gather distributed local blocks from 2D block distributed matrix distributed
     amongst a square process grid into the full global array.
 
@@ -351,19 +351,19 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
         if x.partition != Partition.SCATTER:
             raise ValueError(f"x should have partition={Partition.SCATTER} Got {x.partition} instead...")
-
+        output_dtype = np.result_type(self.dtype, x.dtype)
         y = DistributedArray(
             global_shape=(self.N * self.dimsd[1]),
             local_shapes=[(self.N * c) for c in self._rank_col_lens],
             mask=x.mask,
             partition=Partition.SCATTER,
-            dtype=self.dtype,
+            dtype=output_dtype,
             base_comm=self.base_comm
         )
 
         my_own_cols = self._rank_col_lens[self.rank]
         x_arr = x.local_array.reshape((self.dims[0], my_own_cols))
-        X_local = x_arr.astype(self.dtype)
+        X_local = x_arr.astype(output_dtype)
         Y_local = ncp.vstack(
             self._row_comm.allgather(
                 ncp.matmul(self.A, X_local)
@@ -377,16 +377,28 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition != Partition.SCATTER:
             raise ValueError(f"x should have partition={Partition.SCATTER}. Got {x.partition} instead.")
 
+        # - If A is real: A^H = A^T,
+        #       so result_type(real_A, x.dtype) = x.dtype (if x is complex) or real (if x is real)
+        # - If A is complex: A^H is complex,
+        #       so result will be complex regardless of x
+        if np.iscomplexobj(self.A):
+            output_dtype = np.result_type(self.dtype, x.dtype)
+        else:
+            # Real matrix: A^T @ x preserves input type complexity
+            output_dtype = x.dtype if np.iscomplexobj(x.local_array) else self.dtype
+            # But still need to check type promotion for precision
+            output_dtype = np.result_type(self.dtype, output_dtype)
+
         y = DistributedArray(
             global_shape=(self.K * self.dimsd[1]),
             local_shapes=[self.K * c for c in self._rank_col_lens],
             mask=x.mask,
             partition=Partition.SCATTER,
-            dtype=self.dtype,
+            dtype=output_dtype,
             base_comm=self.base_comm
         )
 
-        x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype)
+        x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(output_dtype)
         X_tile = x_arr[self._row_start:self._row_end, :]
         A_local = self.At if hasattr(self, "At") else self.A.T.conj()
         Y_local = ncp.matmul(A_local, X_tile)
@@ -536,7 +548,6 @@ def __init__(
         self._col_comm = base_comm.Split(color=self._col_id, key=self._row_id)
 
         self.A = A.astype(np.dtype(dtype))
-        if saveAt: self.At = A.T.conj()
 
         self.N = self._col_comm.allreduce(A.shape[0])
         self.K = self._row_comm.allreduce(A.shape[1])
@@ -569,6 +580,7 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition != Partition.SCATTER:
             raise ValueError(f"x should have partition={Partition.SCATTER} Got {x.partition} instead...")
 
+        output_dtype = np.result_type(self.dtype, x.dtype)
         # Calculate local shapes for block distribution
         bn = self._N_padded // self._P_prime  # block size in N dimension
         bm = self._M_padded // self._P_prime  # block size in M dimension
@@ -582,9 +594,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
                              mask=x.mask,
                              local_shapes=local_shapes,
                              partition=Partition.SCATTER,
-                             dtype=self.dtype,
-                             base_comm=self.base_comm
-                             )
+                             dtype=output_dtype,
+                             base_comm=self.base_comm)
 
         # Calculate expected padded dimensions for x
         bk = self._K_padded // self._P_prime  # block size in K dimension
@@ -603,13 +614,13 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         if pad_k > 0 or pad_m > 0:
             x_block = np.pad(x_block, [(0, pad_k), (0, pad_m)], mode='constant')
 
-        Y_local = np.zeros((self.A.shape[0], bm))
+        Y_local = np.zeros((self.A.shape[0], bm),dtype=output_dtype)
 
         for k in range(self._P_prime):
             Atemp = self.A.copy() if self._col_id == k else np.empty_like(self.A)
             Xtemp = x_block.copy() if self._row_id == k else np.empty_like(x_block)
-            self._row_comm.bcast(Atemp, root=k)
-            self._col_comm.bcast(Xtemp, root=k)
+            self._row_comm.Bcast(Atemp, root=k)
+            self._col_comm.Bcast(Xtemp, root=k)
             Y_local += ncp.dot(Atemp, Xtemp)
 
         Y_local_unpadded = Y_local[:local_n, :local_m]
@@ -631,13 +642,24 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         local_m = bm if self._col_id != self._P_prime - 1 else self.M - (self._P_prime - 1) * bm
 
         local_shapes = self.base_comm.allgather(local_k * local_m)
+        # - If A is real: A^H = A^T,
+        #       so result_type(real_A, x.dtype) = x.dtype (if x is complex) or real (if x is real)
+        # - If A is complex: A^H is complex,
+        #       so result will be complex regardless of x
+        if np.iscomplexobj(self.A):
+            output_dtype = np.result_type(self.dtype, x.dtype)
+        else:
+            # Real matrix: A^T @ x preserves input type complexity
+            output_dtype = x.dtype if np.iscomplexobj(x.local_array) else self.dtype
+            # But still need to check type promotion for precision
+            output_dtype = np.result_type(self.dtype, output_dtype)
 
         y = DistributedArray(
             global_shape=(self.K * self.M),
             mask=x.mask,
             local_shapes=local_shapes,
             partition=Partition.SCATTER,
-            dtype=self.dtype,
+            dtype=output_dtype,
             base_comm=self.base_comm
         )
 
@@ -659,7 +681,7 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
             x_block = np.pad(x_block, [(0, pad_n), (0, pad_m)], mode='constant')
 
         A_local = self.At if hasattr(self, "At") else self.A.T.conj()
-        Y_local = np.zeros((self.A.shape[1], bm))
+        Y_local = np.zeros((self.A.shape[1], bm), dtype=output_dtype)
 
         for k in range(self._P_prime):
             requests = []
diff --git a/tests/test_matrixmult.py b/tests/test_matrixmult.py