This repository has been archived by the owner on Aug 3, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgrasp.cu
363 lines (298 loc) · 10.2 KB
/
grasp.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
/*
* C/CUDA implementation of GRASP.
* Emma ????, Felix Moody, & Julien Rabinow
* Fall 2014-Spring 2015
*/
/* System headers */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <math.h> // for square root
#include <time.h> // for benchmarking
#include <complex.h> // for complex double type and operations
/* CUDA headers */
#include <cuda_runtime.h>
#include "cublas_v2.h" // CUBLAS
#include <cuComplex.h> // for cuDoubleComplex type and operations
/* Our headers */
#include "matrix.h" // host and device matrix metadata types
#include "cudaErr.h" // cuda and cublas error handlers
#include "TVTemp.h" // total variate temporal operator
#include "multicoilGpuNUFFT.cpp" // multicoil nonuniform FFT operator
/* CORRECT LINKING DEMO */
//#include <cuda_utils.hpp>
/*
 * Reconstruction parameters bundled into one struct so they can be
 * passed around as a unit.  Populated in main() and load_data().
 * NOTE(review): lambda and l1Smooth are declared but never assigned
 * anywhere in this file -- confirm they are set before CSL1NlCg()
 * (currently commented out) reads them.
 */
typedef struct {
    matrixC * read;        // k space readings (kdata)
    int num_spokes;        // spokes (readings) per frame
    int num_frames;        // number of frames in the data
    double lambda;         // trade off control TODO: between what?
    double l1Smooth;       // TODO: find out what this does
    int num_iter;          // number of iterations of the reconstruction
    cublasHandle_t handle; // handle to CUBLAS context
} param_type;
#if 0
/*
 * Multiply every coil slice of kdata element-wise by sqrt(w).
 * Expected launch (see the commented call site in main()):
 *   grid  = (sliceElems / blockDim.x, num_coils), 1-D blocks,
 * so gridDim.x * blockDim.x equals the number of elements per slice.
 * We should only have to compute the squares of the elements of w
 * one time and use the result for all slices of kdata.
 * NOTE: currently disabled (#if 0) and unused.
 */
__global__ void elementWiseMultBySqrt(cuDoubleComplex * kdata, double * w) {
    // BUG FIX: the original computed i = threadIdx.x * blockIdx.x,
    // which is 0 for block 0 and collides elsewhere; the canonical
    // flat sample index is blockIdx.x * blockDim.x + threadIdx.x.
    int i = blockIdx.x * blockDim.x + threadIdx.x; // sample within a slice
    int j = blockIdx.y;                            // coil (slice) index
    // Samples per slice: the launch config above covers one slice
    // exactly with the x dimension of the grid.
    size_t slice = (size_t)gridDim.x * blockDim.x;
    cuDoubleComplex sqrtofelement = make_cuDoubleComplex(sqrt(w[i]), 0.0);
    // BUG FIX: the original wrote kdata[j], touching only one element
    // per slice; each slice element must be scaled individually.
    // possible overflow error with cuCmul (see cuComplex.h)
    kdata[j * slice + i] = cuCmul(kdata[j * slice + i], sqrtofelement);
}
#endif
/*
 * l1norm -- debug/testing hook (Emma's l1norm testing function).
 * Dumps the first 20 entries of each input matrix to stdout and then
 * terminates the whole program with EXIT_SUCCESS.  Never returns.
 * NOTE(review): param is accepted for signature parity but is unused
 * here; the function does no l1-norm computation yet despite its name.
 */
void l1norm(matrixC * traj,
            matrixC * sens,
            matrixC * read,
            matrix * comp,
            param_type * param) {
    print_matrixC(traj, 0, 20);
    print_matrixC(sens, 0, 20);
    print_matrixC(read, 0, 20);
    print_matrix(comp, 0, 20);
    exit(EXIT_SUCCESS);
}
/*
 * Sort data into a time series:
 *   traj/comp: (samples, readings)        -> (samples, spokes, frames)
 *   read:      (samples, readings, coils) -> (samples, spokes, coils, frames)
 * It'd be cool if this was more general, but for now it's very explicit.
 */
void make_time_series(matrixC * traj, matrixC * read, matrix * comp, param_type * param) {
    // For traj and comp we are only splitting the last dimension into
    // (spokes-per-frame x frames), so the same underlying data can be
    // kept and merely reindexed.
    traj->dims[1] = param->num_spokes;
    traj->dims[2] = param->num_frames;
    comp->dims[1] = param->num_spokes;
    comp->dims[2] = param->num_frames;
    // read, however, requires physically reordering the data:
    // allocate the destination matrix read_ts.
    size_t read_ts_dims[MAX_DIMS] = { read->dims[0],
                                      param->num_spokes,
                                      read->dims[2],
                                      param->num_frames };
    matrixC * read_ts = new_matrixC(read_ts_dims, host);
    size_t * read_coord;
    size_t read_ts_coord[MAX_DIMS];
    size_t read_ts_idx;
    // Walk read column by column (stride = samples per reading).
    for (size_t i = 0; i < read->num; i += read->dims[0]) {
        // convert flat read index to a read coordinate
        read_coord = I2C(i, read->dims);
        // apply the slicing operations
        read_ts_coord[0] = read_coord[0];
        read_ts_coord[1] = read_coord[1] % param->num_spokes;
        read_ts_coord[2] = read_coord[2];
        // BUG FIX: the frame index comes from the reading index
        // (dimension 1), not dimension 3 -- read is only 3-D, so the
        // original read_coord[3] / num_spokes was always 0 and every
        // frame was copied on top of frame 0.
        read_ts_coord[3] = read_coord[1] / param->num_spokes;
        // convert the read_ts coordinate back to a flat index
        read_ts_idx = C2I(read_ts_coord, read_ts->dims);
        // copy one whole column at a time
        memcpy(&(read_ts->data[read_ts_idx]),
               &(read->data[i]),
               read->dims[0] * read->size);
    }
    // Reassign the caller's matrix to the time series and free the old
    // data.
    // NOTE(review): if free_matrixC() frees the struct itself (and not
    // just its data buffer), the assignment below writes through freed
    // memory, and the read_ts struct allocated above leaks -- confirm
    // against matrix.h before relying on this.
    free_matrixC(read);
    *read = *read_ts;
}
/*
 * Normalize a complex matrix on host in place so its maximum modulus is 1.
 * This is "normalize" as in peak normalization in audio and video
 * processing, not as in linear algebra: we uniformly scale the entries
 * so that the entry of greatest modulus has modulus 1 (rather than
 * scaling so that the norm of the matrix-as-vector equals 1).
 * No-op on an all-zero matrix; refuses device matrices.
 */
void normalize(matrixC * mat) {
    // Make sure we get a host matrix.  BUG FIX: the original printed
    // this error and then fell through to dereference device memory
    // from the host anyway; bail out instead.
    if (mat->location == device) {
        printf("Error: normalize only for host matrices\n");
        return;
    }
    // Find the maximum modulus over all entries.
    double max_mod = 0;
    for (size_t i = 0; i < mat->num; i++) {
        if (cuCabs(mat->data[i]) > max_mod) {
            max_mod = cuCabs(mat->data[i]);
        }
    }
    // BUG FIX: an all-zero matrix would divide by 0 below and fill the
    // matrix with NaNs; it is already "normalized", so just return.
    if (max_mod == 0) {
        return;
    }
    // Scale every entry by the maximum modulus.
    for (size_t i = 0; i < mat->num; i++) {
        mat->data[i] = cuCdiv(mat->data[i], make_cuDoubleComplex(max_mod, 0.0));
    }
    /*
    This is (probably nonworking) parallel version, but since it's only run once
    it's probably not worth it to transfer to device and back
    // sens=sens/max({|x|: x entry in sens})
    // scale entries of sens so that maximum modulus = 1
    size_t max_mod_idx;
    cuDoubleComplex max_mod_num;
    cublasErrChk(cublasIzamax(param->handle, mat->num, mat->data, 1, &max_mod_idx));
    cudaErrChk(cudaMemcpy(&max_mod_num,
                          &(mat->data[max_mod_idx]),
                          mat->size, cudaMemcpyDeviceToHost));
    const double inv_max_mod = 1/cuCabs(max_mod_num);
    cublasErrChk(cublasZdscal(handle, b1.t, &inv_max_mod, b1.d, 1));
    */
}
/*
 * Load data from file and save to array structs.
 * (In this case the data is from a matlab .mat file read by a custom
 * script "convertmat.c" using the matio library and then directly
 * written to flat files by fwrite.)
 * Allocates all four matrices on the host, reads them in, closes the
 * files, and copies the matrices to the device.
 * Exits the program with EXIT_FAILURE on any I/O error.
 * NOTE(review): param is currently unused here.
 */
void load_data(matrixC ** traj,
               matrixC ** sens,
               matrixC ** read,
               matrix ** comp,
               param_type * param) {
    // Input data size (based on the k space readings matrix)
    // 1st dim: number of samples per reading per coil
    // 2nd dim: number of readings
    // 3rd dim: number of coils
    // TODO: should these be constants?
    size_t dims[MAX_DIMS] = {768, 600, 12};
    // Make auxiliary matrix data sizes
    size_t dims_sens[MAX_DIMS] = {dims[0] / 2, dims[0] / 2, dims[2]};
    size_t dims_no_coils[MAX_DIMS] = {dims[0], dims[1]};
    // allocate matrices on host
    *traj = new_matrixC(dims_no_coils, host);
    *sens = new_matrixC(dims_sens, host);
    *read = new_matrixC(dims, host);
    *comp = new_matrix(dims_no_coils, host);
    // open matrix files
    // these were pulled from liver_data.mat by matio and convertmat
    //FILE * meta_file = fopen("./liver_data/metadata", "rb");
    FILE * traj_file = fopen("./liver_data/k.matrix", "rb");
    FILE * sens_file = fopen("./liver_data/b1.matrix", "rb");
    FILE * read_file = fopen("./liver_data/kdata.matrix", "rb");
    FILE * comp_file = fopen("./liver_data/w.matrix", "rb");
    // BUG FIX: the original passed potentially-NULL streams straight
    // to fread; fail loudly instead.
    if (traj_file == NULL || sens_file == NULL ||
        read_file == NULL || comp_file == NULL) {
        fprintf(stderr, "load_data: failed to open a matrix file in ./liver_data/\n");
        exit(EXIT_FAILURE);
    }
    // load matrices onto host, verifying each read is complete
    // (fread returns the number of items actually read)
    if (fread((*traj)->data, (*traj)->size, (*traj)->num, traj_file) != (*traj)->num ||
        fread((*sens)->data, (*sens)->size, (*sens)->num, sens_file) != (*sens)->num ||
        fread((*read)->data, (*read)->size, (*read)->num, read_file) != (*read)->num ||
        fread((*comp)->data, (*comp)->size, (*comp)->num, comp_file) != (*comp)->num) {
        fprintf(stderr, "load_data: short read on a matrix file\n");
        exit(EXIT_FAILURE);
    }
    // BUG FIX: the original leaked all four FILE handles
    fclose(traj_file);
    fclose(sens_file);
    fclose(read_file);
    fclose(comp_file);
    // copy matrices to device
    toDeviceC(*traj);
    toDeviceC(*sens);
    toDeviceC(*read);
    toDevice(*comp);
}
/* Just preprocessing, no GPU
 * All the good stuff is in CSL1Nlg()
 */
int main(int argc, char **argv) {
    // Input data metadata structs (defined in matrix.h)
    matrixC * traj; // trajectories through k space (k)
    matrixC * sens; // coil sensitivities (b1)
    matrixC * read; // k space readings (kdata)
    matrix * comp;  // density compensation (w)
    // Reconstruction parameters
    param_type * param = (param_type *)malloc(sizeof(param_type));
    // BUG FIX: the original dereferenced the malloc result unchecked
    if (param == NULL) {
        fprintf(stderr, "main: out of memory\n");
        return EXIT_FAILURE;
    }
    param->num_spokes = 21; // spokes (i.e. readings) per frame (Fibonacci number)
    param->num_iter = 8;    // number of iterations of the reconstruction
    // NOTE(review): param->lambda and param->l1Smooth are never
    // initialized in this file -- confirm before using them.
    cublasErrChk(cublasCreate(&(param->handle))); // create cuBLAS context
    // Load data
    load_data(&traj, &sens, &read, &comp, param);
    // Emma's l1norm testing function (disabled; l1norm() exits the program)
    if (false) {
        l1norm(traj, sens, read, comp, param);
    }
    // normalize coil sensitivities
    normalize(sens);
    // multiply readings by density compensation
    // TODO: write this function
    // crop data so that spokes divide evenly into frames with none left over
    param->num_frames = read->dims[1] / param->num_spokes;
    size_t new_dims_read[MAX_DIMS] = { read->dims[0],
                                       param->num_frames * param->num_spokes,
                                       read->dims[2] };
    size_t new_dims_no_coils[MAX_DIMS] = { new_dims_read[0],
                                           new_dims_read[1] };
    // NOTE(review): if crop_matrixC() allocates a new matrix, the
    // pre-crop matrices leak here -- confirm against matrix.h.
    traj = crop_matrixC(traj, new_dims_no_coils);
    read = crop_matrixC(read, new_dims_read);
    comp = crop_matrix(comp, new_dims_no_coils);
    // sort into time series
    // TODO: get this working
    make_time_series(traj, read, comp, param);
    // gpuNUFFT operator (disabled)
    if (false) {
        int kernel_width = 3;
        int sector_width = 8;
        double oversample_ratio = 1.25;
        createMulticoilGpuNUFFTOperator(traj,
                                        comp,
                                        sens,
                                        kernel_width,
                                        sector_width,
                                        oversample_ratio);
    }
    // print matrices
    printf("\n----K-space trajectories aka traj aka k----\n");
    print_matrixC(traj, 0, 20);
    printf("\n----Coil sensitivities aka sens aka b1----\n");
    print_matrixC(sens, 0, 20);
    printf("\n----K-space readings aka read aka kdata----\n");
    print_matrixC(read, 0, 20);
    printf("\n----Density compensation aka comp aka w----\n");
    print_matrix(comp, 0, 20);
    // WORKING UP TO HERE
    /* CORRECT LINKING DEMO */
    //bindTo1DTexture("symbol", NULL, 0);
    // GPU block and grid dimensions
    /*
    int bt = 512; // max threads per block total
    int bx = 512; // max threads per block x direction
    int by = 512; // max threads per block y direction
    int bz = 64;  // max threads per block z direction
    int gx = 65535;
    int gy = 65535;
    int gz = 65535;
    */
    /*
    // for ch=1:nc,kdata(:,:,ch)=kdata(:,:,ch).*sqrt(w);endc
    // i.e. multiply each of the 12 slices of kdata element-wise by sqrt(w)
    dim3 numBlocks((kdata.x*kdata.y)/bt, kdata.z);
    elementWiseMultBySqrt<<<numBlocks, bt>>>(kdata.d, w.d);
    */
    /*
    // %%%%% multicoil NUFFT operator
    param.E=MCNUFFT(ku,wu,b1);
    */
    /*
    // %%%%% undersampled data
    param.y=kdatau;
    // clear kdata kdatau k ku wu w
    */
    /*
    // %%%%% nufft recon
    // ' := conjugate transpose; * := matrix multiplication
    // ' and * are overloaded, defined in @MCNUFFT
    // what's the order of operations, or does it matter?
    mat3DC recon_nufft=param.E'*param.y;
    */
    /*
    //param.lambda = 0.25*max(abs(recon_nufft(:)));
    */
    // fprintf('\n GRASP reconstruction \n')
    // long starttime = clock_gettime(CLOCK_MONOTONIC, tv_nsec);
    // mat3DC recon_cs=recon_nufft;
    // for (i = 0; i < 4; i++) {
    //     recon_cs = CSL1NlCg(recon_cs,param);
    // }
    // long elapsedtime = (clock_gettime(CLOCK_MONOTONIC, tv_nsec) - starttime)/1000000;
    // recon_nufft=flipdim(recon_nufft,1);
    // recon_cs=flipdim(recon_cs,1);
    // destroy cuBLAS context
    cublasDestroy(param->handle);
    // free memory
    free_matrixC(traj);
    free_matrixC(sens);
    free_matrixC(read);
    // TODO: find out why the following line segfaults
    //free_matrix(comp);
    // BUG FIX: param itself was never freed
    free(param);
    return 0;
}