Skip to content

Commit

Permalink
Merge pull request #564 from chaithyagr/fix_spreadinterponly
Browse files Browse the repository at this point in the history
Support for `gpu_spreadinterponly`=1
  • Loading branch information
ahbarnett authored Oct 8, 2024
2 parents 5f019b0 + 3d85c59 commit cabda72
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 116 deletions.
5 changes: 1 addition & 4 deletions docs/c_gpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove

**gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]

Diagnostic options
~~~~~~~~~~~~~~~~~~

**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms!
Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html))


Algorithm performance options
Expand Down
1 change: 1 addition & 0 deletions docs/error.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
18 size of bins for subprob/blockgather invalid
19 GPU shmem too small for subprob/blockgather parameters
20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
23 invalid upsampfac set while using gpu_spreadinterponly mode.

When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.

Expand Down
140 changes: 76 additions & 64 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->ms = nmodes[0];
d_plan->mt = nmodes[1];
d_plan->mu = nmodes[2];
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
if (d_plan->opts.debug) {
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
}
} else {
d_plan->opts.gpu_spreadinterponly = 1;
}
Expand Down Expand Up @@ -153,7 +155,13 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
}
}

if (d_plan->opts.gpu_spreadinterponly) {
// XNOR implementation below with boolean logic.
if ((d_plan->opts.upsampfac !=1) == (type != 3)) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
}
/* Setup Spreader */
if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
// can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
Expand Down Expand Up @@ -254,70 +262,74 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
} break;
}

cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}
// We dont need any cuFFT plans or kernel values if we are only spreading / interpolating
if(!d_plan->opts.gpu_spreadinterponly) {
cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}

if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;

}
finalize:
if (ier > 1) {
Expand Down
3 changes: 2 additions & 1 deletion include/finufft_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ enum {
FINUFFT_ERR_INSUFFICIENT_SHMEM = 19,
FINUFFT_ERR_NUM_NU_PTS_INVALID = 20,
FINUFFT_ERR_INVALID_ARGUMENT = 21,
FINUFFT_ERR_LOCK_FUNS_INVALID = 22
FINUFFT_ERR_LOCK_FUNS_INVALID = 22,
FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23,
};
#endif
41 changes: 26 additions & 15 deletions src/cuda/1d/cufinufft1d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,22 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// this is needed
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
stream))))
return ier;

// Step 1: Spread
if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
// if spreadonly, skip the rest

if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;

// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
}

Expand Down
35 changes: 23 additions & 12 deletions src/cuda/2d/cufinufft2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
Expand All @@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
}

Expand Down
40 changes: 25 additions & 15 deletions src/cuda/3d/cufinufft3d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
Expand All @@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;

// Step 2: FFT

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
Expand Down Expand Up @@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
}
// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
}

Expand Down
7 changes: 4 additions & 3 deletions src/cuda/memtransfer_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
// Fixes a crash whewre the plan itself is deleted before the stream
const auto stream = d_plan->stream;

CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
// Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk)
if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3)
CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);

CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);
Expand Down
Loading

0 comments on commit cabda72

Please sign in to comment.