Replaced cudaMallocHost/cudaFreeHost calls with their gpu-prefixed equivalents (gpuMallocHost/gpuFreeHost) to allow compiling with HIP

bienz2 · bienz2 · commit bcf92c22b45b · 2025-07-28T10:02:05.000-07:00
diff --git a/src/heterogeneous/gpu_alltoall.c b/src/heterogeneous/gpu_alltoall.c
@@ -24,13 +24,13 @@ int gpu_aware_alltoall(alltoall_ftn f,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     int ierr = f(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
 
     return ierr;
 }
@@ -94,8 +94,8 @@ int copy_to_cpu_alltoall(alltoall_ftn f,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     // Copy from GPU to CPU
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -106,8 +106,8 @@ int copy_to_cpu_alltoall(alltoall_ftn f,
     // Copy from CPU to GPU
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
     
     return ierr;
 }
@@ -174,8 +174,8 @@ int threaded_alltoall_pairwise(const void* sendbuf,
     
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
     
     // Copy from GPU to CPU
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -236,8 +236,8 @@ int threaded_alltoall_pairwise(const void* sendbuf,
 
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
 
     return ierr;
 }
@@ -263,8 +263,8 @@ int threaded_alltoall_nonblocking(const void* sendbuf,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     int ierr = 0;
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -330,8 +330,8 @@ int threaded_alltoall_nonblocking(const void* sendbuf,
 } 
 
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
 
     return ierr;
 }
diff --git a/src/heterogeneous/gpu_alltoallv.c b/src/heterogeneous/gpu_alltoallv.c
@@ -144,8 +144,8 @@ int copy_to_cpu_alltoallv(alltoallv_ftn f,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     // Copy from GPU to CPU
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -157,8 +157,8 @@ int copy_to_cpu_alltoallv(alltoallv_ftn f,
     // Copy from CPU to GPU
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
     
     return ierr;
 }
@@ -286,8 +286,8 @@ int threaded_alltoallv_pairwise(const void* sendbuf,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     // Copy from GPU to CPU
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -348,8 +348,8 @@ int threaded_alltoallv_pairwise(const void* sendbuf,
 
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
 
     return ierr;
 }
@@ -387,8 +387,8 @@ int threaded_alltoallv_nonblocking(const void* sendbuf,
 
     char* cpu_sendbuf;
     char* cpu_recvbuf;
-    cudaMallocHost((void**)&cpu_sendbuf, total_bytes_s);
-    cudaMallocHost((void**)&cpu_recvbuf, total_bytes_r);
+    gpuMallocHost((void**)&cpu_sendbuf, total_bytes_s);
+    gpuMallocHost((void**)&cpu_recvbuf, total_bytes_r);
 
     // Copy from GPU to CPU
     ierr += gpuMemcpy(cpu_sendbuf, sendbuf, total_bytes_s, gpuMemcpyDeviceToHost);
@@ -455,8 +455,8 @@ int threaded_alltoallv_nonblocking(const void* sendbuf,
 
     ierr += gpuMemcpy(recvbuf, cpu_recvbuf, total_bytes_r, gpuMemcpyHostToDevice);
 
-    cudaFreeHost(cpu_sendbuf);
-    cudaFreeHost(cpu_recvbuf);
+    gpuFreeHost(cpu_sendbuf);
+    gpuFreeHost(cpu_recvbuf);
 
     return ierr;
 }