Dual test for NumPy and CuPy in tests #165


Merged: 6 commits, Aug 8, 2025
6 changes: 5 additions & 1 deletion Makefile
@@ -29,7 +29,7 @@ dev-install:

dev-install_nccl:
make pipcheck
$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 $(PIP) install -e .
$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 && $(PIP) install -e .

install_conda:
conda env create -f environment.yml && conda activate pylops_mpi && pip install .
@@ -49,6 +49,10 @@ lint:
tests:
mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi

# assuming NUM_PROCESSES <= number of gpus available
tests_gpu:
export TEST_CUPY_PYLOPS=1 && mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi

# assuming NUM_PROCESSES <= number of gpus available
tests_nccl:
mpiexec -n $(NUM_PROCESSES) pytest tests_nccl/ --with-mpi
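The new tests_gpu target only exports TEST_CUPY_PYLOPS=1 before invoking pytest; each test module reads this variable at import time and aliases np to either NumPy or CuPy (see the test diffs below). A minimal standalone sketch of that switch, assuming CuPy is installed:

import os

# Backend switch driven by the environment variable set in `make tests_gpu`
if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
    import cupy as np                          # run the same tests on the GPU
    from cupy.testing import assert_allclose
    backend = "cupy"
else:
    import numpy as np                         # default CPU path
    from numpy.testing import assert_allclose
    backend = "numpy"

print(f"running tests with the {backend} backend")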
19 changes: 17 additions & 2 deletions docs/source/contributing.rst
@@ -69,6 +69,18 @@ that both the old and new tests pass successfully:

>> make tests

If you run PyLops-MPI with GPUs, you may also do:

.. code-block:: bash

>> make tests_gpu

Additionally, if you have an NCCL-enabled environment, you may also check:

.. code-block:: bash

>> make tests_nccl

4. Make sure the ``examples`` python scripts are executed using 3 processes without any errors:

.. code-block:: bash
@@ -123,8 +135,11 @@ Project structure
This repository is organized as follows:

* **pylops_mpi**: Python library containing various mpi linear operators.
* **tests**: Set of tests using pytest-mpi.
* **tests**: Set of tests using pytest-mpi for both CPU and GPU.
* **tests_nccl**: Set of tests for NCCL-enabled environments using pytest-mpi.
* **testdata**: Sample datasets used in tests and documentation.
* **docs**: Sphinx documentation.
* **examples**: Set of python script examples for each mpi linear operator to be embedded in documentation using sphinx-gallery.
* **tutorials**: Set of python script tutorials to be embedded in documentation using sphinx-gallery.
* **tutorials**: Set of python script tutorials (NumPy & MPI) to be embedded in documentation using sphinx-gallery.
* **tutorials_cupy**: Same set of scripts as above but with CuPy & MPI.
* **tutorials_nccl**: Same set of scripts as above but with CuPy & NCCL.
23 changes: 17 additions & 6 deletions pylops_mpi/DistributedArray.py
@@ -694,14 +694,25 @@ def _compute_vector_norm(self, local_array: NDArray,
recv_buf = self._allreduce_subcomm(ncp.count_nonzero(local_array, axis=axis).astype(ncp.float64))
elif ord == ncp.inf:
# Calculate max followed by max reduction
recv_buf = self._allreduce_subcomm(ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64),
recv_buf, op=MPI.MAX)
recv_buf = ncp.squeeze(recv_buf, axis=axis)
# TODO (tharitt): CuPy + MPI currently does not work well with buffered communication, particularly
# with the MAX and MIN operators. Here we copy the array to the CPU, communicate over MPI, and copy the result back to the GPU
send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
if self.engine == "cupy" and self.base_comm_nccl is None:
recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
else:
recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MAX)
recv_buf = ncp.squeeze(recv_buf, axis=axis)
elif ord == -ncp.inf:
# Calculate min followed by min reduction
recv_buf = self._allreduce_subcomm(ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64),
recv_buf, op=MPI.MIN)
recv_buf = ncp.squeeze(recv_buf, axis=axis)
# TODO (tharitt): see the comment above in infinity norm
send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64)
if self.engine == "cupy" and self.base_comm_nccl is None:
recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MIN)
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
else:
recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MIN)
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))

else:
recv_buf = self._allreduce_subcomm(ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis))
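The hunks above stage MAX/MIN reductions through host memory when the engine is cupy and no NCCL communicator is present, since buffered mpi4py reductions with MPI.MAX/MPI.MIN on CuPy arrays currently misbehave. A standalone sketch of the same host-staging idea (the helper name is hypothetical, not part of pylops-mpi):

import numpy as np
import cupy as cp
from mpi4py import MPI

def allreduce_max_host_staged(comm, send_buf_gpu):
    # Hypothetical helper: device -> host copy, reduce on host buffers,
    # then copy the reduced result back to the GPU.
    send_host = send_buf_gpu.get()                     # CuPy array to CPU
    recv_host = np.empty_like(send_host)
    comm.Allreduce(send_host, recv_host, op=MPI.MAX)   # plain MPI reduction on host buffers
    return cp.asarray(recv_host)                       # result back on the GPU

# usage sketch: each rank contributes its local maxima
# result = allreduce_max_host_staged(MPI.COMM_WORLD, cp.abs(local_array).max(axis=0))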
6 changes: 4 additions & 2 deletions pylops_mpi/basicoperators/MatrixMult.py
@@ -232,7 +232,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
mask=x.mask,
partition=Partition.SCATTER,
dtype=self.dtype,
base_comm=self.base_comm
base_comm=self.base_comm,
engine=x.engine
)

my_own_cols = self._rank_col_lens[self.rank]
@@ -257,7 +258,8 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
mask=x.mask,
partition=Partition.SCATTER,
dtype=self.dtype,
base_comm=self.base_comm
base_comm=self.base_comm,
engine=x.engine
)

x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype)
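Both hunks forward x.engine when building the output DistributedArray, so a CuPy input yields a CuPy output instead of silently falling back to the default numpy engine. A minimal sketch of the pattern (the helper below is illustrative only, not part of the operator):

from pylops_mpi import DistributedArray, Partition

def make_output_like(x, global_shape):
    # Illustrative helper: allocate the result with the same backend as the input
    return DistributedArray(
        global_shape=global_shape,
        partition=Partition.SCATTER,
        dtype=x.dtype,
        base_comm=x.base_comm,
        engine=x.engine,   # "numpy" or "cupy", matching x.local_array
    )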
30 changes: 22 additions & 8 deletions tests/test_blockdiag.py
@@ -2,9 +2,19 @@
Designed to run with n processes
$ mpiexec -n 10 pytest test_blockdiag.py --with-mpi
"""
import os

if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
import cupy as np
from cupy.testing import assert_allclose

backend = "cupy"
else:
import numpy as np
from numpy.testing import assert_allclose

backend = "numpy"
from mpi4py import MPI
import numpy as np
from numpy.testing import assert_allclose
import pytest

import pylops
@@ -17,6 +27,10 @@
par2j = {'ny': 301, 'nx': 101, 'dtype': np.complex128}

np.random.seed(42)
rank = MPI.COMM_WORLD.Get_rank()
if backend == "cupy":
device_id = rank % np.cuda.runtime.getDeviceCount()
np.cuda.Device(device_id).use()


@pytest.mark.mpi(min_size=2)
@@ -27,11 +41,11 @@ def test_blockdiag(par):
Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']))
BDiag_MPI = pylops_mpi.MPIBlockDiag(ops=[Op, ])

x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
x[:] = np.ones(shape=par['nx'], dtype=par['dtype'])
x_global = x.asarray()

y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
y[:] = np.ones(shape=par['ny'], dtype=par['dtype'])
y_global = y.asarray()

@@ -68,16 +82,16 @@ def test_stacked_blockdiag(par):
FirstDeriv_MPI = pylops_mpi.MPIFirstDerivative(dims=(par['ny'], par['nx']), dtype=par['dtype'])
StackedBDiag_MPI = pylops_mpi.MPIStackedBlockDiag(ops=[BDiag_MPI, FirstDeriv_MPI])

dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
x_global = x.asarray()

dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
y_global = y.asarray()
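When the cupy backend is active, each MPI rank pins itself to a GPU in round-robin fashion, which is also why the Makefile assumes NUM_PROCESSES does not exceed the number of available GPUs. The assignment shown standalone (assuming CuPy and at least one visible GPU):

import cupy as cp
from mpi4py import MPI

rank = MPI.COMM_WORLD.Get_rank()
ngpus = cp.cuda.runtime.getDeviceCount()   # GPUs visible to this process
cp.cuda.Device(rank % ngpus).use()         # round-robin rank-to-device mapping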
69 changes: 42 additions & 27 deletions tests/test_derivative.py
@@ -2,9 +2,20 @@
Designed to run with n processes
$ mpiexec -n 10 pytest test_derivative.py --with-mpi
"""
import numpy as np
import os

if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
import cupy as np
from cupy.testing import assert_allclose

backend = "cupy"
else:
import numpy as np
from numpy.testing import assert_allclose

backend = "numpy"
import numpy as npp
from mpi4py import MPI
from numpy.testing import assert_allclose
import pytest

import pylops
@@ -14,6 +25,10 @@
np.random.seed(42)
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
if backend == "cupy":
device_id = rank % np.cuda.runtime.getDeviceCount()
np.cuda.Device(device_id).use()


par1 = {
"nz": 600,
@@ -189,8 +204,8 @@ def test_first_derivative_forward(par):
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
kind="forward", edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -200,7 +215,7 @@
y_adj_dist = Fop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -223,8 +238,8 @@ def test_first_derivative_backward(par):
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
kind="backward", edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -234,7 +249,7 @@
y_adj_dist = Fop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -258,8 +273,8 @@ def test_first_derivative_centered(par):
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
kind="centered", edge=par['edge'],
order=order, dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -269,7 +284,7 @@
y_adj_dist = Fop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -292,8 +307,8 @@ def test_second_derivative_forward(par):
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
kind="forward", edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -303,7 +318,7 @@
y_adj_dist = Sop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -326,8 +341,8 @@ def test_second_derivative_backward(par):
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
kind="backward", edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -337,7 +352,7 @@
y_adj_dist = Sop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -360,8 +375,8 @@ def test_second_derivative_centered(par):
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
kind="centered", edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
partition=par['partition'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -371,7 +386,7 @@
y_adj_dist = Sop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))

if rank == 0:
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -394,7 +409,7 @@ def test_laplacian(par):
weights=par['weights'], sampling=par['sampling'],
kind=kind, edge=par['edge'],
dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype'])
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend)
x[:] = np.random.normal(rank, 10, x.local_shape)
x_global = x.asarray()
# Forward
@@ -404,7 +419,7 @@
y_adj_dist = Lop_MPI.H @ x
y_adj = y_adj_dist.asarray()
# Dot test
dottest(Lop_MPI, x, y_dist, np.prod(par['n']), np.prod(par['n']))
dottest(Lop_MPI, x, y_dist, npp.prod(par['n']), npp.prod(par['n']))

if rank == 0:
Lop = pylops.Laplacian(dims=par['n'], axes=par['axes'],
@@ -426,7 +441,7 @@ def test_gradient(par):
Gop_MPI = pylops_mpi.basicoperators.MPIGradient(dims=par['n'], sampling=par['sampling'],
kind=kind, edge=par['edge'],
dtype=par['dtype'])
x_fwd = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype'])
x_fwd = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend)
x_fwd[:] = np.random.normal(rank, 10, x_fwd.local_shape)
x_global = x_fwd.asarray()

@@ -436,11 +451,11 @@
y = y_dist.asarray()

# Adjoint
x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
x_adj_dist1[:] = np.random.normal(rank, 10, x_adj_dist1.local_shape)
x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
x_adj_dist2[:] = np.random.normal(rank, 20, x_adj_dist2.local_shape)
x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
x_adj_dist3[:] = np.random.normal(rank, 30, x_adj_dist3.local_shape)
x_adj = pylops_mpi.StackedDistributedArray(distarrays=[x_adj_dist1, x_adj_dist2, x_adj_dist3])
x_adj_global = x_adj.asarray()
@@ -449,7 +464,7 @@
y_adj = y_adj_dist.asarray()

# Dot test
dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * np.prod(par['n']), np.prod(par['n']))
dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * npp.prod(par['n']), npp.prod(par['n']))

if rank == 0:
Gop = pylops.Gradient(dims=par['n'], sampling=par['sampling'],
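Note the extra import numpy as npp alias: shape arithmetic such as npp.prod(par['nz']) deliberately stays on host NumPy even when np is aliased to CuPy, so that global shapes and dot-test sizes remain plain host numbers rather than 0-d device arrays. A small sketch of the distinction, assuming CuPy is installed:

import numpy as npp   # always host NumPy, used only for shape arithmetic
import cupy as np     # backend alias used for the actual array data

dims = (21, 11)
n_host = int(npp.prod(dims))        # plain Python int, safe as a global shape
n_dev = np.prod(np.asarray(dims))   # 0-d CuPy array living on the GPU
print(type(n_host), type(n_dev))    # <class 'int'> vs <class 'cupy.ndarray'>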