From 51ab3e7ae3bc04fbe7d1697b87bbc6fc1d595d8a Mon Sep 17 00:00:00 2001 From: tharittk Date: Sun, 3 Aug 2025 04:51:08 -0500 Subject: [PATCH 1/6] Change scripts in tests/ to be able to run with CuPy and NumPy based on env var --- pylops_mpi/basicoperators/MatrixMult.py | 6 +- tests/test_blockdiag.py | 26 ++++-- tests/test_derivative.py | 65 ++++++++------ tests/test_distributedarray.py | 38 +++++--- tests/test_fredholm.py | 23 +++-- tests/test_linearop.py | 111 +++++++++++++----------- tests/test_matrixmult.py | 20 ++++- tests/test_solver.py | 62 +++++++------ tests/test_stack.py | 33 +++++-- tests/test_stackedarray.py | 39 ++++++--- tests/test_stackedlinearop.py | 62 +++++++------ 11 files changed, 308 insertions(+), 177 deletions(-) diff --git a/pylops_mpi/basicoperators/MatrixMult.py b/pylops_mpi/basicoperators/MatrixMult.py index 39eda45e..85f30826 100644 --- a/pylops_mpi/basicoperators/MatrixMult.py +++ b/pylops_mpi/basicoperators/MatrixMult.py @@ -232,7 +232,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray: mask=x.mask, partition=Partition.SCATTER, dtype=self.dtype, - base_comm=self.base_comm + base_comm=self.base_comm, + engine=x.engine ) my_own_cols = self._rank_col_lens[self.rank] @@ -257,7 +258,8 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray: mask=x.mask, partition=Partition.SCATTER, dtype=self.dtype, - base_comm=self.base_comm + base_comm=self.base_comm, + engine=x.engine ) x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype) diff --git a/tests/test_blockdiag.py b/tests/test_blockdiag.py index bd6b81a5..e98fb331 100644 --- a/tests/test_blockdiag.py +++ b/tests/test_blockdiag.py @@ -2,9 +2,19 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_blockdiag.py --with-mpi """ +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI -import numpy as np -from numpy.testing import assert_allclose import pytest import pylops @@ -27,11 +37,11 @@ def test_blockdiag(par): Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype'])) BDiag_MPI = pylops_mpi.MPIBlockDiag(ops=[Op, ]) - x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.ones(shape=par['nx'], dtype=par['dtype']) x_global = x.asarray() - y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) y[:] = np.ones(shape=par['ny'], dtype=par['dtype']) y_global = y.asarray() @@ -68,16 +78,16 @@ def test_stacked_blockdiag(par): FirstDeriv_MPI = pylops_mpi.MPIFirstDerivative(dims=(par['ny'], par['nx']), dtype=par['dtype']) StackedBDiag_MPI = pylops_mpi.MPIStackedBlockDiag(ops=[BDiag_MPI, FirstDeriv_MPI]) - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype']) - dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist2[:] = 
np.ones(dist2.local_shape, dtype=par['dtype']) x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) x_global = x.asarray() - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype']) - dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype']) y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) y_global = y.asarray() diff --git a/tests/test_derivative.py b/tests/test_derivative.py index 3d3a4a65..fd2473f6 100644 --- a/tests/test_derivative.py +++ b/tests/test_derivative.py @@ -2,9 +2,20 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_derivative.py --with-mpi """ -import numpy as np +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" +import numpy as npp from mpi4py import MPI -from numpy.testing import assert_allclose import pytest import pylops @@ -189,8 +200,8 @@ def test_first_derivative_forward(par): Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'], kind="forward", edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -200,7 +211,7 @@ def test_first_derivative_forward(par): y_adj_dist = Fop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Fop = pylops.FirstDerivative(dims=par['nz'], axis=0, @@ -223,8 +234,8 @@ def test_first_derivative_backward(par): Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'], kind="backward", edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -234,7 +245,7 @@ def test_first_derivative_backward(par): y_adj_dist = Fop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Fop = pylops.FirstDerivative(dims=par['nz'], axis=0, @@ -258,8 +269,8 @@ def test_first_derivative_centered(par): Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'], kind="centered", edge=par['edge'], order=order, dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], 
engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -269,7 +280,7 @@ def test_first_derivative_centered(par): y_adj_dist = Fop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Fop = pylops.FirstDerivative(dims=par['nz'], axis=0, @@ -292,8 +303,8 @@ def test_second_derivative_forward(par): Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'], kind="forward", edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -303,7 +314,7 @@ def test_second_derivative_forward(par): y_adj_dist = Sop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Sop = pylops.SecondDerivative(dims=par['nz'], axis=0, @@ -326,8 +337,8 @@ def test_second_derivative_backward(par): Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'], kind="backward", edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -337,7 +348,7 @@ def test_second_derivative_backward(par): y_adj_dist = Sop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Sop = pylops.SecondDerivative(dims=par['nz'], axis=0, @@ -360,8 +371,8 @@ def test_second_derivative_centered(par): Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'], kind="centered", edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'], - partition=par['partition']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'], + partition=par['partition'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -371,7 +382,7 @@ def test_second_derivative_centered(par): y_adj_dist = Sop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz'])) + dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz'])) if rank == 0: Sop = pylops.SecondDerivative(dims=par['nz'], axis=0, @@ -394,7 +405,7 @@ def test_laplacian(par): weights=par['weights'], sampling=par['sampling'], kind=kind, edge=par['edge'], dtype=par['dtype']) - x = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype']) + x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend) x[:] = np.random.normal(rank, 10, x.local_shape) x_global = x.asarray() # Forward @@ -404,7 +415,7 @@ def test_laplacian(par): y_adj_dist = 
Lop_MPI.H @ x y_adj = y_adj_dist.asarray() # Dot test - dottest(Lop_MPI, x, y_dist, np.prod(par['n']), np.prod(par['n'])) + dottest(Lop_MPI, x, y_dist, npp.prod(par['n']), npp.prod(par['n'])) if rank == 0: Lop = pylops.Laplacian(dims=par['n'], axes=par['axes'], @@ -426,7 +437,7 @@ def test_gradient(par): Gop_MPI = pylops_mpi.basicoperators.MPIGradient(dims=par['n'], sampling=par['sampling'], kind=kind, edge=par['edge'], dtype=par['dtype']) - x_fwd = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype']) + x_fwd = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend) x_fwd[:] = np.random.normal(rank, 10, x_fwd.local_shape) x_global = x_fwd.asarray() @@ -436,11 +447,11 @@ def test_gradient(par): y = y_dist.asarray() # Adjoint - x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype']) + x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend) x_adj_dist1[:] = np.random.normal(rank, 10, x_adj_dist1.local_shape) - x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype']) + x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend) x_adj_dist2[:] = np.random.normal(rank, 20, x_adj_dist2.local_shape) - x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype']) + x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend) x_adj_dist3[:] = np.random.normal(rank, 30, x_adj_dist3.local_shape) x_adj = pylops_mpi.StackedDistributedArray(distarrays=[x_adj_dist1, x_adj_dist2, x_adj_dist3]) x_adj_global = x_adj.asarray() @@ -449,7 +460,7 @@ def test_gradient(par): y_adj = y_adj_dist.asarray() # Dot test - dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * np.prod(par['n']), np.prod(par['n'])) + dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * npp.prod(par['n']), npp.prod(par['n'])) if rank == 0: Gop = pylops.Gradient(dims=par['n'], sampling=par['sampling'], diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index 8354c48a..9efeb71f 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -2,10 +2,20 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_distributedarray.py --with-mpi """ -import numpy as np +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI import pytest -from numpy.testing import assert_allclose from pylops_mpi import DistributedArray, Partition from pylops_mpi.DistributedArray import local_split @@ -77,7 +87,8 @@ def test_creation(par): """Test creation of local arrays""" distributed_array = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) loc_shape = local_split(distributed_array.global_shape, distributed_array.base_comm, distributed_array.partition, @@ -88,12 +99,14 @@ def test_creation(par): # Distributed array of ones distributed_ones = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_ones[:] = 1 # 
Distributed array of zeroes distributed_zeroes = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_zeroes[:] = 0 # Test for distributed ones assert isinstance(distributed_ones, DistributedArray) @@ -132,7 +145,8 @@ def test_local_shapes(par): distributed_array = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], axis=par['axis'], local_shapes=loc_shapes, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) assert isinstance(distributed_array, DistributedArray) assert distributed_array.local_shape == loc_shapes[distributed_array.rank] @@ -189,8 +203,10 @@ def test_distributed_norm(par): arr = DistributedArray.to_dist(x=par['x'], axis=par['axis']) assert_allclose(arr.norm(ord=1, axis=par['axis']), np.linalg.norm(par['x'], ord=1, axis=par['axis']), rtol=1e-14) - assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), - np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) + + # TODO (tharitt): FAIL with CuPy + MPI for inf norm + # assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), + # np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) assert_allclose(arr.norm(), np.linalg.norm(par['x'].flatten()), rtol=1e-13) @@ -317,7 +333,9 @@ def test_distributed_maskednorm(par): arr = DistributedArray.to_dist(x=x, mask=mask, axis=par['axis']) assert_allclose(arr.norm(ord=1, axis=par['axis']), np.linalg.norm(par['x'], ord=1, axis=par['axis']) / nsub, rtol=1e-14) - assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), - np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) + + # TODO (tharitt): Fail with CuPy + MPI + # assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), + # np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) assert_allclose(arr.norm(ord=2, axis=par['axis']), np.linalg.norm(par['x'], ord=2, axis=par['axis']) / np.sqrt(nsub), rtol=1e-13) diff --git a/tests/test_fredholm.py b/tests/test_fredholm.py index 95bd5468..a7ca26f8 100644 --- a/tests/test_fredholm.py +++ b/tests/test_fredholm.py @@ -2,8 +2,19 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_fredholm.py --with-mpi """ -import numpy as np -from numpy.testing import assert_allclose +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" +import numpy as npp from mpi4py import MPI import pytest @@ -110,9 +121,9 @@ def test_Fredholm1(par): # split across ranks nsl_rank = local_split((par["nsl"], ), MPI.COMM_WORLD, Partition.SCATTER, 0) - nsl_ranks = np.concatenate(MPI.COMM_WORLD.allgather(nsl_rank)) - islin_rank = np.insert(np.cumsum(nsl_ranks)[:-1] , 0, 0)[rank] - islend_rank = np.cumsum(nsl_ranks)[rank] + nsl_ranks = npp.concatenate(MPI.COMM_WORLD.allgather(nsl_rank)) + islin_rank = npp.insert(npp.cumsum(nsl_ranks)[:-1] , 0, 0)[rank] + islend_rank = npp.cumsum(nsl_ranks)[rank] Frank = F[islin_rank:islend_rank] Fop_MPI = MPIFredholm1( @@ -125,7 +136,7 @@ def test_Fredholm1(par): x = DistributedArray(global_shape=par["nsl"] * par["ny"] * par["nz"], partition=pylops_mpi.Partition.BROADCAST, - dtype=par["dtype"]) + dtype=par["dtype"], engine=backend) x[:] = 1. + par["imag"] * 1. 
x_global = x.asarray() # Forward diff --git a/tests/test_linearop.py b/tests/test_linearop.py index 4d16f3d9..fc986f09 100644 --- a/tests/test_linearop.py +++ b/tests/test_linearop.py @@ -2,8 +2,18 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_linearop.py --with-mpi """ -import numpy as np -from numpy.testing import assert_allclose +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI import pytest @@ -67,7 +77,7 @@ def test_transpose(par): Top_MPI = BDiag_MPI.T # For forward mode - x = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) x[:] = np.ones(par['ny']) x_global = x.asarray() Top_x = Top_MPI @ x @@ -75,7 +85,7 @@ def test_transpose(par): Top_x_np = Top_x.asarray() # For adjoint mode - y = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + y = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) y[:] = np.ones(par['nx']) y_global = y.asarray() Top_y = Top_MPI.H @ y @@ -101,7 +111,7 @@ def test_scaled(par): Sop_MPI = BDiag_MPI * -4 # For forward mode - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.ones(par['nx']) x_global = x.asarray() Sop_x = Sop_MPI @ x @@ -109,7 +119,7 @@ def test_scaled(par): Sop_x_np = Sop_x.asarray() # For adjoint mode - y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) y[:] = np.ones(par['ny']) y_global = y.asarray() Sop_y = Sop_MPI.H @ y @@ -125,38 +135,39 @@ def test_scaled(par): assert_allclose(Sop_y_np, Sop.H @ y_global, rtol=1e-9) -@pytest.mark.mpi(min_size=2) -@pytest.mark.parametrize("par", [(par1), (par1j)]) -def test_power(par): - """Test the PowerLinearOperator""" - Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype'])) - BDiag_MPI = MPIBlockDiag(ops=[Op, ]) - # Power Operator - Pop_MPI = BDiag_MPI ** 3 - - # Forward Mode - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) - x[:] = np.ones(par['nx']) - x_global = x.asarray() - Pop_x = Pop_MPI @ x - assert isinstance(Pop_x, DistributedArray) - Pop_x_np = Pop_x.asarray() - - # Adjoint Mode - y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) - y[:] = np.ones(par['ny']) - y_global = y.asarray() - Pop_y = Pop_MPI.H @ y - assert isinstance(Pop_y, DistributedArray) - Pop_y_np = Pop_y.asarray() - - if rank == 0: - ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in - range(size)] - BDiag = pylops.BlockDiag(ops=ops) - Pop = BDiag ** 3 - assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9) - assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9) +# @pytest.mark.mpi(min_size=2) +# @pytest.mark.parametrize("par", [(par1), (par1j)]) +# def test_power(par): +# """Test the PowerLinearOperator""" +# Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype'])) +# BDiag_MPI = MPIBlockDiag(ops=[Op, ]) +# # Power Operator +# Pop_MPI = BDiag_MPI ** 3 + +# # Forward Mode +# x = DistributedArray(global_shape=size * par['nx'], 
dtype=par['dtype'], engine=backend) +# x[:] = np.ones(par['nx']) +# x_global = x.asarray() +# Pop_x = Pop_MPI @ x +# assert isinstance(Pop_x, DistributedArray) +# Pop_x_np = Pop_x.asarray() + +# # Adjoint Mode +# y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) +# y[:] = np.ones(par['ny']) +# y_global = y.asarray() +# Pop_y = Pop_MPI.H @ y +# assert isinstance(Pop_y, DistributedArray) +# Pop_y_np = Pop_y.asarray() + +# if rank == 0: +# ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in +# range(size)] +# BDiag = pylops.BlockDiag(ops=ops) +# # TODO (tharitt): Fail PyLops Op ** 3 does not preserve CuPy (it turns to NumPy) +# Pop = BDiag ** 3 +# assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9) +# assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9) @pytest.mark.mpi(min_size=2) @@ -172,7 +183,7 @@ def test_sum(par): Sop_MPI = BDiag_MPI_1 + BDiag_MPI_2 # Forward Mode - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.ones(par['nx']) x_global = x.asarray() Sop_x = Sop_MPI @ x @@ -180,7 +191,7 @@ def test_sum(par): Sop_x_np = Sop_x.asarray() # Adjoint Mode - y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) y[:] = np.ones(par['ny']) y_global = y.asarray() Sop_y = Sop_MPI.H @ y @@ -208,11 +219,11 @@ def test_product(par): Op2 = pylops.MatrixMult(A=((rank + 2) * np.ones(shape=(par['nx'], par['ny']))).astype(par['dtype'])) BDiag_MPI_2 = MPIBlockDiag(ops=[Op2, ]) # Product Op Pop_MPI = BDiag_MPI_1 * BDiag_MPI_2 # Forward Mode - x = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) x[:] = np.ones(par['ny']) x_global = x.asarray() Pop_x = Pop_MPI @ x @@ -220,7 +231,7 @@ def test_product(par): Pop_x_np = Pop_x.asarray() # Adjoint Mode - y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) y[:] = np.ones(par['ny']) y_global = y.asarray() Pop_y = Pop_MPI.H @ y @@ -249,7 +260,7 @@ def test_conj(par): Cop_MPI = BDiag_MPI.conj() # For forward mode - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.ones(par['nx']) x_global = x.asarray() Cop_x = Cop_MPI @ x @@ -257,7 +268,7 @@ def test_conj(par): Cop_x_np = Cop_x.asarray() # For adjoint mode - y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) y[:] = np.ones(par['ny']) y_global = y.asarray() Cop_y = Cop_MPI.H @ y @@ -284,7 +295,8 @@ def test_mpilinop(par): # Use Partition="BROADCAST" for a single operator # Forward x = DistributedArray(global_shape=Mop.shape[1], - partition=Partition.BROADCAST, dtype=par['dtype']) + partition=Partition.BROADCAST, dtype=par['dtype'], + engine=backend) x[:] = np.random.normal(1, 10, x.local_shape).astype(par['dtype']) x_global = x.asarray() y_dist = Mop @ x # Adjoint x = DistributedArray(global_shape=Mop.shape[0], -
partition=Partition.BROADCAST, dtype=par['dtype'], + engine=backend) x[:] = np.random.normal(1, 10, x.local_shape).astype(par['dtype']) x_adj_global = x.asarray() y_adj_dist = Mop.H @ x @@ -317,7 +330,7 @@ def test_fwd_mpilinop(par): FullOp_MPI = VStack_MPI @ Mop # Broadcasted DistributedArray - x = DistributedArray(global_shape=FullOp_MPI.shape[1], partition=Partition.BROADCAST, dtype=par['dtype']) + x = DistributedArray(global_shape=FullOp_MPI.shape[1], partition=Partition.BROADCAST, dtype=par['dtype'], engine=backend) x[:] = np.random.normal(1, 10, x.local_shape).astype(par['dtype']) x_global = x.asarray() @@ -346,7 +359,7 @@ def test_adj_mpilinop(par): FullOp_MPI = VStack_MPI @ Mop # Scattered DistributedArray - x = DistributedArray(global_shape=FullOp_MPI.shape[0], partition=Partition.SCATTER, dtype=par['dtype']) + x = DistributedArray(global_shape=FullOp_MPI.shape[0], partition=Partition.SCATTER, dtype=par['dtype'], engine=backend) x[:] = np.random.normal(1, 10, x.local_shape).astype(par['dtype']) x_global = x.asarray() diff --git a/tests/test_matrixmult.py b/tests/test_matrixmult.py index 7def7807..65bc5452 100644 --- a/tests/test_matrixmult.py +++ b/tests/test_matrixmult.py @@ -2,9 +2,20 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_matrixmult.py --with-mpi """ +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" +import numpy as npp import math -import numpy as np -from numpy.testing import assert_allclose from mpi4py import MPI import pytest @@ -90,7 +101,7 @@ def test_MPIMatrixMult(N, K, M, dtype_str): # Create DistributedArray for input x (representing B flattened) all_local_col_len = comm.allgather(local_col_X_len) - total_cols = np.sum(all_local_col_len) + total_cols = npp.sum(all_local_col_len) x_dist = DistributedArray( global_shape=(K * total_cols), @@ -98,7 +109,8 @@ def test_MPIMatrixMult(N, K, M, dtype_str): partition=Partition.SCATTER, base_comm=comm, mask=[i % p_prime for i in range(size)], - dtype=dtype + dtype=dtype, + engine=backend ) x_dist.local_array[:] = X_p.ravel() diff --git a/tests/test_solver.py b/tests/test_solver.py index baa63bbe..c4a54315 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -2,8 +2,18 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_solver.py --with-mpi """ -import numpy as np -from numpy.testing import assert_allclose +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI import pytest import pylops @@ -101,18 +111,18 @@ def test_cg(par): # To make MPIBlockDiag a positive definite matrix BDiag_MPI = MPIBlockDiag(ops=[Aop, ]) - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.random.normal(1, 10, par["nx"]) + par["imag"] * np.random.normal(10, 10, par["nx"]) x_global = x.asarray() if par["x0"]: - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) x0_global = x0.asarray() else: # 
Set TO 0s if x0 = False - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = 0 x0_global = x0.asarray() y = BDiag_MPI * x @@ -149,18 +159,18 @@ def test_cgls(par): # To make MPIBlockDiag a positive definite matrix BDiag_MPI = MPIBlockDiag(ops=[Aop, ]) - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.random.normal(1, 10, par["nx"]) + par["imag"] * np.random.normal(10, 10, par["nx"]) x_global = x.asarray() if par["x0"]: - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) x0_global = x0.asarray() else: # Set TO 0s if x0 = False - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = 0 x0_global = x0.asarray() y = BDiag_MPI * x @@ -197,18 +207,18 @@ def test_cgls_broadcastdata(par): Aop = MatrixMult(A, dtype=par["dtype"]) HStack_MPI = MPIHStack(ops=[Aop, ]) - x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x[:] = np.random.normal(1, 10, par['nx']) + par["imag"] * np.random.normal(10, 10, par['nx']) x_global = x.asarray() if par["x0"]: - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) x0_global = x0.asarray() else: # Set TO 0s if x0 = False - x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + x0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) x0[:] = 0 x0_global = x0.asarray() y = HStack_MPI @ x @@ -246,18 +256,18 @@ def test_cgls_broadcastmodel(par): # To make MPIVStack a positive definite matrix VStack_MPI = MPIVStack(ops=[Aop, ]) - x = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST) + x = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST, engine=backend) x[:] = np.random.normal(1, 10, par['nx']) + par["imag"] * np.random.normal(10, 10, par['nx']) x_global = x.asarray() if par["x0"]: - x0 = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST) + x0 = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST, engine=backend) x0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) x0_global = x0.asarray() else: # Set TO 0s if x0 = False - x0 = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST) + x0 = DistributedArray(global_shape=par['nx'], dtype=par['dtype'], partition=Partition.BROADCAST, engine=backend) x0[:] = 0 x0_global = x0.asarray() y = VStack_MPI @ x @@ -298,19 +308,19 @@ def test_cg_stacked(par): BDiag_MPI = MPIBlockDiag(ops=[Aop, ]) StackedBDiag_MPI = MPIStackedBlockDiag(ops=[BDiag_MPI, BDiag_MPI]) - dist1 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1 = 
DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1[:] = np.random.normal(1, 10, par["nx"]) + par["imag"] * np.random.normal(10, 10, par["nx"]) - dist2 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist2 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist2[:] = np.random.normal(5, 10, par["nx"]) + par["imag"] * np.random.normal(50, 10, par["nx"]) x = StackedDistributedArray([dist1, dist2]) x_global = x.asarray() if par["x0"]: - dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1_0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) - dist2_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist2_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist2_0[:] = np.random.normal(10, 10, par["nx"]) + par["imag"] * np.random.normal( 0, 10, par["nx"] ) @@ -318,9 +328,9 @@ def test_cg_stacked(par): x0_global = x0.asarray() else: # Set TO 0s if x0 = False - dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1_0[:] = 0 - dist2_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist2_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist2_0[:] = 0 x0 = StackedDistributedArray([dist1_0, dist2_0]) x0_global = x0.asarray() @@ -364,19 +374,19 @@ def test_cgls_stacked(par): VStack_MPI = MPIVStack(ops=[Aop, ]) StackedBDiag_MPI = MPIStackedBlockDiag(ops=[BDiag_MPI, VStack_MPI]) - dist1 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1[:] = np.random.normal(1, 10, par["nx"]) + par["imag"] * np.random.normal(10, 10, par["nx"]) - dist2 = DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype']) + dist2 = DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype'], engine=backend) dist2[:] = np.random.normal(5, 10, dist2.local_shape) + par["imag"] * np.random.normal(50, 10, dist2.local_shape) x = StackedDistributedArray([dist1, dist2]) x_global = x.asarray() if par["x0"]: - dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1_0[:] = np.random.normal(0, 10, par["nx"]) + par["imag"] * np.random.normal( 10, 10, par["nx"] ) - dist2_0 = DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype']) + dist2_0 = DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype'], engine=backend) dist2_0[:] = np.random.normal(10, 10, dist2_0.local_shape) + par["imag"] * np.random.normal( 0, 10, dist2_0.local_shape ) @@ -384,9 +394,9 @@ def test_cgls_stacked(par): x0_global = x0.asarray() else: # Set TO 0s if x0 = False - dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1_0 = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1_0[:] = 0 - dist2_0 = DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype']) + dist2_0 = 
DistributedArray(global_shape=par['nx'], partition=Partition.BROADCAST, dtype=par['dtype'], engine=backend) dist2_0[:] = 0 x0 = StackedDistributedArray([dist1_0, dist2_0]) x0_global = x0.asarray() diff --git a/tests/test_stack.py b/tests/test_stack.py index 5a552475..1b88a4ab 100644 --- a/tests/test_stack.py +++ b/tests/test_stack.py @@ -2,8 +2,18 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_stack.py --with-mpi """ -import numpy as np -from numpy.testing import assert_allclose +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI import pytest @@ -30,14 +40,16 @@ def test_vstack(par): # Broadcasted DistributedArray(global_shape == local_shape) x = pylops_mpi.DistributedArray(global_shape=par['nx'], partition=pylops_mpi.Partition.BROADCAST, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) x[:] = np.ones(shape=par['nx'], dtype=par['dtype']) x_global = x.asarray() # Scattered DistributedArray y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], partition=pylops_mpi.Partition.SCATTER, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) y[:] = np.ones(shape=par['ny'], dtype=par['dtype']) y_global = y.asarray() @@ -76,14 +88,15 @@ def test_stacked_vstack(par): # Broadcasted DistributedArray(global_shape == local_shape) x = pylops_mpi.DistributedArray(global_shape=par['nx'], partition=pylops_mpi.Partition.BROADCAST, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) x[:] = np.ones(shape=par['nx'], dtype=par['dtype']) x_global = x.asarray() # Stacked DistributedArray - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype']) - dist2 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist1.local_shape, dtype=par['dtype']) y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) y_global = y.asarray() @@ -119,14 +132,16 @@ def test_hstack(par): # Scattered DistributedArray x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], partition=pylops_mpi.Partition.SCATTER, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) x[:] = np.ones(shape=par['nx'], dtype=par['dtype']) x_global = x.asarray() # Broadcasted DistributedArray(global_shape == local_shape) y = pylops_mpi.DistributedArray(global_shape=par['ny'], partition=pylops_mpi.Partition.BROADCAST, - dtype=par['dtype']) + dtype=par['dtype'], + engine=backend) y[:] = np.ones(shape=par['ny'], dtype=par['dtype']) y_global = y.asarray() diff --git a/tests/test_stackedarray.py b/tests/test_stackedarray.py index a3f3cc23..6b49e5f5 100644 --- a/tests/test_stackedarray.py +++ b/tests/test_stackedarray.py @@ -2,9 +2,20 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_stackedarray.py --with-mpi """ -import numpy as np +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" +import numpy as npp import pytest -from numpy.testing import 
assert_allclose from pylops_mpi import DistributedArray, Partition, StackedDistributedArray @@ -40,10 +51,12 @@ def test_creation(par): # Create stacked array distributed_array0 = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_array1 = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_array0[:] = 0 distributed_array1[:] = 1 @@ -70,12 +83,14 @@ def test_stacked_math(par): """Test the Element-Wise Addition, Subtraction and Multiplication, Dot-product, Norm""" distributed_array0 = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_array1 = DistributedArray(global_shape=par['global_shape'], partition=par['partition'], - dtype=par['dtype'], axis=par['axis']) + dtype=par['dtype'], axis=par['axis'], + engine=backend) distributed_array0[:] = 0 - distributed_array1[:] = np.arange(np.prod(distributed_array1.local_shape)).reshape(distributed_array1.local_shape) + distributed_array1[:] = np.arange(npp.prod(distributed_array1.local_shape)).reshape(distributed_array1.local_shape) stacked_array1 = StackedDistributedArray([distributed_array0, distributed_array1]) stacked_array2 = StackedDistributedArray([distributed_array1, distributed_array0]) @@ -107,12 +122,16 @@ def test_stacked_math(par): l0norm = stacked_array1.norm(0) l1norm = stacked_array1.norm(1) l2norm = stacked_array1.norm(2) - linfnorm = stacked_array1.norm(np.inf) + + # TODO (tharitt): FAIL with CuPy + MPI for inf norm - see test_distributedarray.py + # test_distributed_nrom(par) as well +# linfnorm = stacked_array1.norm(np.inf) assert_allclose(l0norm, np.linalg.norm(stacked_array1.asarray().flatten(), 0), rtol=1e-14) assert_allclose(l1norm, np.linalg.norm(stacked_array1.asarray().flatten(), 1), rtol=1e-14) assert_allclose(l2norm, np.linalg.norm(stacked_array1.asarray(), 2), rtol=1e-10) # needed to raise it due to how partial norms are combined (with power applied) - assert_allclose(linfnorm, np.linalg.norm(stacked_array1.asarray().flatten(), np.inf), - rtol=1e-14) + # TODO (tharitt): FAIL at inf norm - see above +# assert_allclose(linfnorm, np.linalg.norm(stacked_array1.asarray().flatten(), np.inf), +# rtol=1e-14) diff --git a/tests/test_stackedlinearop.py b/tests/test_stackedlinearop.py index c31d32b6..e81a6a55 100644 --- a/tests/test_stackedlinearop.py +++ b/tests/test_stackedlinearop.py @@ -2,8 +2,18 @@ Designed to run with n processes $ mpiexec -n 10 pytest test_stackedlinearop.py --with-mpi """ -import numpy as np -from numpy.testing import assert_allclose +import os + +if int(os.environ.get("TEST_CUPY_PYLOPS", 0)): + import cupy as np + from cupy.testing import assert_allclose + + backend = "cupy" +else: + import numpy as np + from numpy.testing import assert_allclose + + backend = "numpy" from mpi4py import MPI import pytest @@ -53,9 +63,9 @@ def test_transpose(par): Top_MPI = StackedBlockDiag_MPI.T # For forward mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['ny']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], 
dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) x_global = x.asarray() @@ -64,9 +74,9 @@ def test_transpose(par): Top_x_np = Top_x.asarray() # For adjoint mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['nx']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) y_global = y.asarray() @@ -96,9 +106,9 @@ def test_scaled(par): Sop_MPI = StackedBlockDiag_MPI * -4 # For forward mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['nx']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) x_global = x.asarray() @@ -107,9 +117,9 @@ def test_scaled(par): Sop_x_np = Sop_x.asarray() # For adjoint mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['ny']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) y_global = y.asarray() @@ -139,9 +149,9 @@ def test_conj(par): Cop_MPI = StackedBlockDiag_MPI.conj() # For forward mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['nx']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) x_global = x.asarray() @@ -150,9 +160,9 @@ def test_conj(par): Cop_x_np = Cop_x.asarray() # For adjoint mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['ny']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) y_global = y.asarray() @@ -182,9 +192,9 
@@ def test_power(par): Pop_MPI = StackedBlockDiag_MPI.conj() # For forward mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['nx']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) x_global = x.asarray() @@ -193,9 +203,9 @@ def test_power(par): Pop_x_np = Pop_x.asarray() # For adjoint mode - dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist_1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist_1[:] = np.ones(par['ny']) - dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype']) + dist_2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend) dist_2[:] = np.ones(dist_2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist_1, dist_2]) y_global = y.asarray() @@ -230,9 +240,9 @@ def test_sum(par): Sop_MPI = StackedBDiag_MPI_1 + StackedBDiag_MPI_2 # Forward Mode - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape) - dist2 = pylops_mpi.DistributedArray(global_shape=par['ny'] * par['nx'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=par['ny'] * par['nx'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) x_global = x.asarray() @@ -240,9 +250,9 @@ def test_sum(par): Sop_x_np = Sop_x.asarray() # Adjoint Mode - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape) - dist2 = pylops_mpi.DistributedArray(global_shape=par['ny'] * par['nx'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=par['ny'] * par['nx'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) y_global = y.asarray() @@ -280,9 +290,9 @@ def test_product(par): Pop_MPI = StackedBDiag_MPI_1 * StackedBDiag_MPI_2 # Forward Mode - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) dist1[:] = np.ones(dist1.local_shape) - dist2 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist2.local_shape) x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) x_global = x.asarray() @@ -290,9 +300,9 @@ def test_product(par): Pop_x_np = Pop_x.asarray() # Adjoint Mode - dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype']) + dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) 
dist1[:] = np.ones(dist1.local_shape) - dist2 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype']) + dist2 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) dist2[:] = np.ones(dist2.local_shape) y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2]) y_global = y.asarray() From 8d458b09778617c5ab76297cd3bdd68e192faa02 Mon Sep 17 00:00:00 2001 From: mrava87 Date: Sun, 3 Aug 2025 21:08:34 +0000 Subject: [PATCH 2/6] test: temporary fix to test_power --- tests/test_linearop.py | 67 +++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/tests/test_linearop.py b/tests/test_linearop.py index fc986f09..9be59580 100644 --- a/tests/test_linearop.py +++ b/tests/test_linearop.py @@ -135,39 +135,40 @@ def test_scaled(par): assert_allclose(Sop_y_np, Sop.H @ y_global, rtol=1e-9) -# @pytest.mark.mpi(min_size=2) -# @pytest.mark.parametrize("par", [(par1), (par1j)]) -# def test_power(par): -# """Test the PowerLinearOperator""" -# Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype'])) -# BDiag_MPI = MPIBlockDiag(ops=[Op, ]) -# # Power Operator -# Pop_MPI = BDiag_MPI ** 3 - -# # Forward Mode -# x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) -# x[:] = np.ones(par['nx']) -# x_global = x.asarray() -# Pop_x = Pop_MPI @ x -# assert isinstance(Pop_x, DistributedArray) -# Pop_x_np = Pop_x.asarray() - -# # Adjoint Mode -# y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) -# y[:] = np.ones(par['ny']) -# y_global = y.asarray() -# Pop_y = Pop_MPI.H @ y -# assert isinstance(Pop_y, DistributedArray) -# Pop_y_np = Pop_y.asarray() - -# if rank == 0: -# ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in -# range(size)] -# BDiag = pylops.BlockDiag(ops=ops) -# # TODO (tharitt): Fail PyLops Op ** 3 does not preserve CuPy (it turns to NumPy) -# Pop = BDiag ** 3 -# assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9) -# assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9) +@pytest.mark.mpi(min_size=2) +@pytest.mark.parametrize("par", [(par1), (par1j)]) +def test_power(par): + """Test the PowerLinearOperator""" + Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']), + dtype=par['dtype']) + BDiag_MPI = MPIBlockDiag(ops=[Op, ]) + + # Power Operator + Pop_MPI = BDiag_MPI ** 3 + + # Forward Mode + x = DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend) + x[:] = np.ones(par['nx']) + x_global = x.asarray() + Pop_x = Pop_MPI @ x + assert isinstance(Pop_x, DistributedArray) + Pop_x_np = Pop_x.asarray() + + # Adjoint Mode + y = DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend) + y[:] = np.ones(par['ny']) + y_global = y.asarray() + Pop_y = Pop_MPI.H @ y + assert isinstance(Pop_y, DistributedArray) + Pop_y_np = Pop_y.asarray() + + if rank == 0: + ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in + range(size)] + BDiag = pylops.BlockDiag(ops=ops) + Pop = BDiag * BDiag * BDiag ## temporarely replaced BDiag ** 3 until bug in PyLops is fixed + assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9) + assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9) @pytest.mark.mpi(min_size=2) From 33121a5f221ee89799f306de99b0b9e609a50824 Mon Sep 17 00:00:00 2001 From: tharittk Date: 
Mon, 4 Aug 2025 09:52:02 -0500 Subject: [PATCH 3/6] temporary use CPU buffer for CuPy + MPI in inf and -inf norm --- pylops_mpi/DistributedArray.py | 23 +++++++++++++++++------ tests/test_distributedarray.py | 8 ++++---- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pylops_mpi/DistributedArray.py b/pylops_mpi/DistributedArray.py index 5807a518..2f3bc9a6 100644 --- a/pylops_mpi/DistributedArray.py +++ b/pylops_mpi/DistributedArray.py @@ -694,14 +694,25 @@ def _compute_vector_norm(self, local_array: NDArray, recv_buf = self._allreduce_subcomm(ncp.count_nonzero(local_array, axis=axis).astype(ncp.float64)) elif ord == ncp.inf: # Calculate max followed by max reduction - recv_buf = self._allreduce_subcomm(ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64), - recv_buf, op=MPI.MAX) - recv_buf = ncp.squeeze(recv_buf, axis=axis) + # TODO (tharitt): currently CuPy + MPI does not work well with buffered communication, particularly + # with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs + send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64) + if self.engine=="cupy" and self.base_comm_nccl is None: + recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX) + recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis)) + else: + recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MAX) + recv_buf = ncp.squeeze(recv_buf, axis=axis) elif ord == -ncp.inf: # Calculate min followed by min reduction - recv_buf = self._allreduce_subcomm(ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64), - recv_buf, op=MPI.MIN) - recv_buf = ncp.squeeze(recv_buf, axis=axis) + # TODO (tharitt): see the comment above in infinity norm + send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64) + if self.engine == "cupy" and self.base_comm_nccl is None: + recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MIN) + recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis)) + else: + recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MIN) + recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis)) else: recv_buf = self._allreduce_subcomm(ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis)) diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index 9efeb71f..3569ebef 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -205,8 +205,8 @@ def test_distributed_norm(par): np.linalg.norm(par['x'], ord=1, axis=par['axis']), rtol=1e-14) # TODO (tharitt): FAIL with CuPy + MPI for inf norm - # assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), - # np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) + assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), + np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) assert_allclose(arr.norm(), np.linalg.norm(par['x'].flatten()), rtol=1e-13) @@ -335,7 +335,7 @@ def test_distributed_maskednorm(par): np.linalg.norm(par['x'], ord=1, axis=par['axis']) / nsub, rtol=1e-14) # TODO (tharitt): Fail with CuPy + MPI - # assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), - # np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) + assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), + np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) assert_allclose(arr.norm(ord=2, axis=par['axis']), np.linalg.norm(par['x'], ord=2, axis=par['axis']) / np.sqrt(nsub), rtol=1e-13) From 66e3b166a6a14ae4dd74558c87a79bce92daf1b6 
Mon Sep 17 00:00:00 2001 From: tharittk Date: Tue, 5 Aug 2025 08:01:33 -0500 Subject: [PATCH 4/6] ensure unique gpu device for each mpi rank in CuPy MPI tests --- pylops_mpi/DistributedArray.py | 2 +- tests/test_blockdiag.py | 8 ++++++++ tests/test_derivative.py | 8 ++++++++ tests/test_distributedarray.py | 10 +++++++++- tests/test_fredholm.py | 7 +++++++ tests/test_linearop.py | 11 +++++++++-- tests/test_matrixmult.py | 18 +++++++++++++----- tests/test_solver.py | 7 +++++++ tests/test_stack.py | 9 +++++++++ tests/test_stackedarray.py | 10 ++++++++++ tests/test_stackedlinearop.py | 8 ++++++++ 11 files changed, 89 insertions(+), 9 deletions(-) diff --git a/pylops_mpi/DistributedArray.py b/pylops_mpi/DistributedArray.py index 2f3bc9a6..979882c0 100644 --- a/pylops_mpi/DistributedArray.py +++ b/pylops_mpi/DistributedArray.py @@ -697,7 +697,7 @@ def _compute_vector_norm(self, local_array: NDArray, # TODO (tharitt): currently CuPy + MPI does not work well with buffered communication, particularly # with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64) - if self.engine=="cupy" and self.base_comm_nccl is None: + if self.engine == "cupy" and self.base_comm_nccl is None: recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX) recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis)) else: diff --git a/tests/test_blockdiag.py b/tests/test_blockdiag.py index e98fb331..379245cb 100644 --- a/tests/test_blockdiag.py +++ b/tests/test_blockdiag.py @@ -27,6 +27,14 @@ par2j = {'ny': 301, 'nx': 101, 'dtype': np.complex128} np.random.seed(42) +rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() @pytest.mark.mpi(min_size=2) diff --git a/tests/test_derivative.py b/tests/test_derivative.py index fd2473f6..37f21782 100644 --- a/tests/test_derivative.py +++ b/tests/test_derivative.py @@ -25,6 +25,14 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() + par1 = { "nz": 600, diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index 3569ebef..ebeacdf3 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -21,6 +21,14 @@ from pylops_mpi.DistributedArray import local_split np.random.seed(42) +rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() par1 = {'global_shape': (500, 501), 'partition': Partition.SCATTER, 'dtype': np.float64, @@ -206,7 +214,7 @@ def test_distributed_norm(par): # TODO (tharitt): FAIL with CuPy + MPI for inf norm assert_allclose(arr.norm(ord=np.inf, axis=par['axis']), - np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) + np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14) assert_allclose(arr.norm(), np.linalg.norm(par['x'].flatten()), rtol=1e-13) diff --git a/tests/test_fredholm.py index 
a7ca26f8..8be2ca64 100644 --- a/tests/test_fredholm.py +++ b/tests/test_fredholm.py @@ -29,6 +29,13 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() par1 = { "nsl": 21, diff --git a/tests/test_linearop.py b/tests/test_linearop.py index 9be59580..5084190f 100644 --- a/tests/test_linearop.py +++ b/tests/test_linearop.py @@ -31,6 +31,13 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() par1 = {'ny': 101, 'nx': 101, 'dtype': np.float64} par1j = {'ny': 101, 'nx': 101, 'dtype': np.complex128} @@ -142,7 +149,7 @@ def test_power(par): Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']), dtype=par['dtype']) BDiag_MPI = MPIBlockDiag(ops=[Op, ]) - + # Power Operator Pop_MPI = BDiag_MPI ** 3 @@ -166,7 +173,7 @@ def test_power(par): ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in range(size)] BDiag = pylops.BlockDiag(ops=ops) - Pop = BDiag * BDiag * BDiag ## temporarely replaced BDiag ** 3 until bug in PyLops is fixed + Pop = BDiag * BDiag * BDiag # temporarely replaced BDiag ** 3 until bug in PyLops is fixed assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9) assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9) diff --git a/tests/test_matrixmult.py b/tests/test_matrixmult.py index 65bc5452..01de6f19 100644 --- a/tests/test_matrixmult.py +++ b/tests/test_matrixmult.py @@ -19,14 +19,21 @@ from mpi4py import MPI import pytest -from pylops.basicoperators import FirstDerivative, Identity +from pylops.basicoperators import FirstDerivative from pylops_mpi import DistributedArray, Partition from pylops_mpi.basicoperators import MPIMatrixMult, MPIBlockDiag np.random.seed(42) base_comm = MPI.COMM_WORLD size = base_comm.Get_size() - +rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() # Define test cases: (N, K, M, dtype_str) # M, K, N are matrix dimensions A(N,K), B(K,M) # P_prime will be ceil(sqrt(size)). 
@@ -39,6 +46,7 @@ pytest.param(2, 1, 3, "float32", id="f32_2_1_3",), ] + def _reorganize_local_matrix(x_dist, N, M, blk_cols, p_prime): """Re-organize distributed array in local matrix """ @@ -66,9 +74,9 @@ def test_MPIMatrixMult(N, K, M, dtype_str): cmplx = 1j if np.issubdtype(dtype, np.complexfloating) else 0 base_float_dtype = np.float32 if dtype == np.complex64 else np.float64 - comm, rank, row_id, col_id, is_active = \ - MPIMatrixMult.active_grid_comm(base_comm, N, M) - if not is_active: return + comm, rank, row_id, col_id, is_active = MPIMatrixMult.active_grid_comm(base_comm, N, M) + if not is_active: + return size = comm.Get_size() p_prime = math.isqrt(size) diff --git a/tests/test_solver.py b/tests/test_solver.py index c4a54315..cba71f5f 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -38,6 +38,13 @@ size = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() par1 = { "ny": 11, diff --git a/tests/test_stack.py b/tests/test_stack.py index 1b88a4ab..73c6ea49 100644 --- a/tests/test_stack.py +++ b/tests/test_stack.py @@ -21,6 +21,15 @@ import pylops_mpi from pylops_mpi.utils.dottest import dottest +rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() + par1 = {'ny': 101, 'nx': 101, 'imag': 0, 'dtype': np.float64} par1j = {'ny': 101, 'nx': 101, 'imag': 1j, 'dtype': np.complex128} par2 = {'ny': 301, 'nx': 101, 'imag': 0, 'dtype': np.float64} diff --git a/tests/test_stackedarray.py b/tests/test_stackedarray.py index 6b49e5f5..624b449b 100644 --- a/tests/test_stackedarray.py +++ b/tests/test_stackedarray.py @@ -17,9 +17,19 @@ import numpy as npp import pytest +from mpi4py import MPI from pylops_mpi import DistributedArray, Partition, StackedDistributedArray np.random.seed(42) +rank = MPI.COMM_WORLD.Get_rank() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() + par1 = {'global_shape': (500, 501), 'partition': Partition.SCATTER, 'dtype': np.float64, diff --git a/tests/test_stackedlinearop.py b/tests/test_stackedlinearop.py index e81a6a55..9dc4a56f 100644 --- a/tests/test_stackedlinearop.py +++ b/tests/test_stackedlinearop.py @@ -24,6 +24,14 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() +if backend == "cupy": + device_count = np.cuda.runtime.getDeviceCount() + device_id = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") + or rank % np.cuda.runtime.getDeviceCount() + ) + np.cuda.Device(device_id).use() + par1 = {'ny': 101, 'nx': 101, 'dtype': np.float64} par1j = {'ny': 101, 'nx': 101, 'dtype': np.complex128} From 99ff24bf76f30b39ace8e7bba750cdc9aca1aa84 Mon Sep 17 00:00:00 2001 From: tharittk Date: Wed, 6 Aug 2025 10:11:24 -0500 Subject: [PATCH 5/6] remove slurm-related env in test scripts, fix Makefile type, add tests_gpu --- Makefile | 6 +++++- tests/test_blockdiag.py | 6 +----- tests/test_derivative.py | 6 +----- tests/test_distributedarray.py | 6 +----- tests/test_fredholm.py | 6 +----- tests/test_linearop.py | 6 +----- 
tests/test_matrixmult.py | 7 ++----- tests/test_solver.py | 6 +----- tests/test_stack.py | 6 +----- tests/test_stackedarray.py | 6 +----- tests/test_stackedlinearop.py | 6 +----- 11 files changed, 16 insertions(+), 51 deletions(-) diff --git a/Makefile b/Makefile index 409438d0..84389239 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ dev-install: dev-install_nccl: make pipcheck - $(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 $(PIP) install -e . + $(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 && $(PIP) install -e . install_conda: conda env create -f environment.yml && conda activate pylops_mpi && pip install . @@ -49,6 +49,10 @@ lint: tests: mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi +# assuming NUM_PROCESSES <= number of gpus available +tests_gpu: + export TEST_CUPY_PYLOPS=1 && mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi + # assuming NUM_PROCESSES <= number of gpus available tests_nccl: mpiexec -n $(NUM_PROCESSES) pytest tests_nccl/ --with-mpi diff --git a/tests/test_blockdiag.py b/tests/test_blockdiag.py index 379245cb..e2688f42 100644 --- a/tests/test_blockdiag.py +++ b/tests/test_blockdiag.py @@ -29,11 +29,7 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() diff --git a/tests/test_derivative.py b/tests/test_derivative.py index 37f21782..5ef6eef6 100644 --- a/tests/test_derivative.py +++ b/tests/test_derivative.py @@ -26,11 +26,7 @@ rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index ebeacdf3..8e8fe529 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -23,11 +23,7 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() par1 = {'global_shape': (500, 501), diff --git a/tests/test_fredholm.py b/tests/test_fredholm.py index 8be2ca64..883b2940 100644 --- a/tests/test_fredholm.py +++ b/tests/test_fredholm.py @@ -30,11 +30,7 @@ rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() par1 = { diff --git a/tests/test_linearop.py b/tests/test_linearop.py index 5084190f..626db251 100644 --- a/tests/test_linearop.py +++ b/tests/test_linearop.py @@ -32,11 +32,7 @@ rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % 
np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() par1 = {'ny': 101, 'nx': 101, 'dtype': np.float64} diff --git a/tests/test_matrixmult.py b/tests/test_matrixmult.py index 01de6f19..f785275c 100644 --- a/tests/test_matrixmult.py +++ b/tests/test_matrixmult.py @@ -28,12 +28,9 @@ size = base_comm.Get_size() rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() + # Define test cases: (N, K, M, dtype_str) # M, K, N are matrix dimensions A(N,K), B(K,M) # P_prime will be ceil(sqrt(size)). diff --git a/tests/test_solver.py b/tests/test_solver.py index cba71f5f..22b0baf8 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -39,11 +39,7 @@ size = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() par1 = { diff --git a/tests/test_stack.py b/tests/test_stack.py index 73c6ea49..31d8c92b 100644 --- a/tests/test_stack.py +++ b/tests/test_stack.py @@ -23,11 +23,7 @@ rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() par1 = {'ny': 101, 'nx': 101, 'imag': 0, 'dtype': np.float64} diff --git a/tests/test_stackedarray.py b/tests/test_stackedarray.py index 624b449b..df9fcdfd 100644 --- a/tests/test_stackedarray.py +++ b/tests/test_stackedarray.py @@ -23,11 +23,7 @@ np.random.seed(42) rank = MPI.COMM_WORLD.Get_rank() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() diff --git a/tests/test_stackedlinearop.py b/tests/test_stackedlinearop.py index 9dc4a56f..469c9267 100644 --- a/tests/test_stackedlinearop.py +++ b/tests/test_stackedlinearop.py @@ -25,11 +25,7 @@ rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() if backend == "cupy": - device_count = np.cuda.runtime.getDeviceCount() - device_id = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") - or rank % np.cuda.runtime.getDeviceCount() - ) + device_id = rank % np.cuda.runtime.getDeviceCount() np.cuda.Device(device_id).use() From af8977f6254fe625bc5af27a11ea378f856dcd92 Mon Sep 17 00:00:00 2001 From: tharittk Date: Thu, 7 Aug 2025 09:14:03 -0500 Subject: [PATCH 6/6] add documentation on contributing page --- docs/source/contributing.rst | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 43899d48..a3ef96a0 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -69,6 +69,18 @@ that the both old and new tests pass successfully: >> make tests +If you run PyLops-MPI with GPUs you may also do: + +.. 
code-block:: bash + + >> make tests_gpu + +Additionally, if you have an NCCL-enabled environment, you may also check: + + .. code-block:: bash + + >> make tests_nccl + 4. Make sure the ``examples`` python scripts are executed using 3 processes without any errors: .. code-block:: bash @@ -123,8 +135,11 @@ Project structure This repository is organized as follows: * **pylops_mpi**: Python library containing various mpi linear operators. -* **tests**: Set of tests using pytest-mpi. +* **tests**: Set of tests using pytest-mpi for both CPU and GPU. +* **tests_nccl**: Set of tests for NCCL-enabled environments using pytest-mpi. * **testdata**: Sample datasets used in tests and documentation. * **docs**: Sphinx documentation. * **examples**: Set of python script examples for each mpi linear operator to be embedded in documentation using sphinx-gallery. -* **tutorials**: Set of python script tutorials to be embedded in documentation using sphinx-gallery. +* **tutorials**: Set of python script tutorials (NumPy & MPI) to be embedded in documentation using sphinx-gallery. +* **tutorials_cupy**: Same set of scripts as above but with CuPy & MPI. +* **tutorials_nccl**: Same set of scripts as above but with CuPy & NCCL.
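
The host-staging workaround introduced in PATCH 3/6 for the inf and -inf norms can be sketched in isolation: when the engine is CuPy and no NCCL communicator is available, the local reduction result is copied to host memory, reduced with MPI.MAX (or MPI.MIN), and copied back to the GPU. The sketch below is illustrative only and uses mpi4py calls directly rather than the library-internal _allreduce_subcomm helper; array sizes and names are assumptions, not part of the patches.

.. code-block:: python

    # Minimal sketch of the host-staging workaround from PATCH 3/6: with CuPy +
    # plain MPI (no NCCL), the local max is copied to the CPU, reduced with
    # MPI.MAX, and moved back to the GPU afterwards.
    import cupy as cp
    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    local_array = cp.random.rand(1000)                        # this rank's share of the vector

    send_host = cp.abs(local_array).max().get().reshape(1)    # GPU -> CPU copy of the local max
    recv_host = np.zeros_like(send_host)
    comm.Allreduce(send_host, recv_host, op=MPI.MAX)          # buffered reduction on host arrays
    inf_norm = cp.asarray(recv_host)[0]                       # CPU -> GPU copy of the global max

The patch itself flags this as temporary (see the TODO comments in _compute_vector_norm), to be revisited once buffered MAX/MIN reductions work reliably on CuPy arrays.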
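
Similarly, the per-rank device binding that the test modules converge on in PATCH 5/6 can be exercised as a small standalone script. This is a minimal, illustrative sketch, not part of the patches; it assumes CuPy and mpi4py are installed and at least one GPU is visible.

.. code-block:: python

    # Minimal sketch of the per-rank GPU selection used in the test modules:
    # each MPI rank picks one device, wrapping around when there are more
    # ranks than visible GPUs.
    import cupy as cp
    from mpi4py import MPI

    rank = MPI.COMM_WORLD.Get_rank()
    device_id = rank % cp.cuda.runtime.getDeviceCount()   # round-robin rank-to-GPU mapping
    cp.cuda.Device(device_id).use()                        # subsequent CuPy allocations use this device

Run under mpiexec with TEST_CUPY_PYLOPS=1, as the new tests_gpu Makefile target does, this keeps ranks from piling onto device 0 when several GPUs are available.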