From 608e7a5c054e86cf56e3cfabb51112b3e9d40e5d Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:22:09 -0400 Subject: [PATCH 01/13] //my dev_idata changed...have no idea why. pointarry add --- src/main.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..90ec7b6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,42 +26,43 @@ int main(int argc, char* argv[]) { printf("****************\n"); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; + a[SIZE - 1] = 0;//a[0]-a[size-1]:number=size printArray(SIZE, a, true); zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printArray(SIZE, b, true); + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); + StreamCompaction::CPU::scan(NPOT, c, a);//253 printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - + ///////////////////////////////////////////////////// zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -112,12 +113,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + getchar(); } From 024b41245628f7149e26d57cdada41b05b30eaf0 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:23:07 -0400 Subject: [PATCH 02/13] kernel function complete n Please enter the commit message for your changes. Lines starting --- stream_compaction/common.cu | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..5893e1b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -22,17 +22,41 @@ namespace Common { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ -__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int k = threadIdx.x; + + if (idata[k] == 0)bools[k] = 0; + else bools[k] = 1; + /*for (int i = 0; i < n; i++){ + if (idata[i] == 0) bools[i] = 0; + else { + bools[i] = 1; + } + }*/ + } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ -__global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { +__global__ void kernScatter(int n, int *odata,const int *idata, const int *bools, const int *indices) { + //last, dev_odata, dev_idata, dev_bool, dev_boolb + //indices[i]={0,1,2,3,4},n is the muber of indices // TODO + /*for (int i = 0; i < n; i++){ + if (bools[i] == 1) + { + odata[indices[i]] = idata[i]; + } + }*/ + //for (int i = 0; i < n; i++){ odata[i] = 0; } + int k = threadIdx.x; + if (bools[k] == 1){ + int t = indices[k];// + odata[t] = idata[k]; + } } } From 3b1aa2f8e5356548131c92e3f3355a98131e69d2 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:23:43 -0400 Subject: [PATCH 03/13] efficient --- stream_compaction/efficient.cu | 140 ++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 3 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..2a19417 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include +int *dev_A1; +int *dev_B1; namespace StreamCompaction { namespace Efficient { @@ -11,9 +14,73 @@ namespace Efficient { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ +__global__ void Uscan(int p1, int p2, int *od){ + int thid = threadIdx.x*2*p1; + //od[thid + p1 - 1] = id[thid + p1 - 1]; + //od[thid + p2 - 1] = id[thid + p2 - 1]; + + od[thid + p2 - 1] += od[thid + p1 - 1]; + + } +__global__ void put0(int * odata, int n) + { + odata[n - 1] = 0; + } + +__global__ void Dscan(int p1,int p2,int *od){ + + int thid = threadIdx.x*2*p1; + + //od[thid + p1 - 1] = id[thid + p1 - 1]; + //od[thid + p2 - 1] = id[thid + p2 - 1]; + //if (thid == n) { od[n - 1] = 0; } + int t = od[thid +p1 - 1];// + od[thid + p1 - 1] = od[thid + p2 - 1]; + od[thid + p2 - 1] += t; + } + + + +void init(int n, const int *hst_A){ + + int _size = n*sizeof(int); + cudaMalloc((void**)&dev_A1, _size); + cudaMemcpy(dev_A1, hst_A, _size, cudaMemcpyHostToDevice); + + +} + + void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + + //dev_A1,dev_B1 + int p1,p2; + int num; + if (n % 2 != 0) + { + num = ilog2ceil(n); + num = pow(2, num); + } + else num = n; + int *_idata = new int[num]; + init(num, idata); + for (int d = 0; d <= ilog2ceil(num) - 1; d++){ + p1 = pow(2, d); + p2 = pow(2, d + 1); + Uscan << <1, num >> >(p1, p2, dev_A1); + } + put0 << <1, 1 >> >(dev_A1, num); + for (int d = ilog2ceil(num) - 1; d >= 0; d--){ + p1 = pow(2, d); + p2 = pow(2, d + 1); + Dscan << <1, num >> >(p1, p2, dev_A1); + + } + cudaMemcpy(odata, dev_A1, num* sizeof(int), cudaMemcpyDeviceToHost);//destination,source, + cudaFree(dev_A1); + + printf("3.1\n"); } /** @@ -24,10 +91,77 @@ void scan(int n, int *odata, const int *idata) { * @param odata The array into which to store elements. * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. - */ + */ +int *dev_idata; +int *dev_odata; +int *dev_indices; +int *dev_bool; +int *dev_boolb; + int compact(int n, int *odata, const int *idata) { // TODO - return -1; + int num; + if (n % 2 != 0) + { + num = ilog2ceil(n); + num = pow(2, num); + } + else num = n; + + int _size = num*sizeof(int); + + cudaMalloc((void**)&dev_bool, _size); + cudaMalloc((void**)&dev_boolb, _size); + + cudaMalloc((void**)&dev_idata, _size); + cudaMemcpy(dev_idata, idata, _size, cudaMemcpyHostToDevice); + + int p1, p2; + int hst; + int last; + //step 1 + Common::kernMapToBoolean <<< 1, n >>>(n, dev_bool, dev_idata); + cudaMemcpy(dev_boolb, dev_bool, _size, cudaMemcpyDeviceToDevice); + //cudaMemcpy(&hst, &dev_idata[6],sizeof(int), cudaMemcpyDeviceToHost); + //std::cout << hst; + //Step 2 + for (int d = 0; d <= ilog2ceil(num) - 1; d++){ + p1 = pow(2, d); + p2 = pow(2, d + 1); + Uscan <<<1, num >> >(p1, p2, dev_boolb);//change end to n + } + put0 <<<1, 1 >> >(dev_boolb, num); + //cudaMemcpy(&hst, &dev_idata[6], sizeof(int), cudaMemcpyDeviceToHost); + //std::cout << hst << "ss1"; + for (int d = ilog2ceil(num) - 1; d >= 0; d--){ + p1 = pow(2, d); + p2 = pow(2, d + 1); + Dscan <<<1, num >> >(p1, p2, dev_boolb); + } + cudaMemcpy(dev_idata, idata, _size, cudaMemcpyHostToDevice); + //my dev_idata changed...have no idea why. + ////////////???????????????????????///////////// + //cudaMemcpy(&hst, &dev_idata[6], sizeof(int), cudaMemcpyDeviceToHost); + //std::cout << hst << "ss2"; + //Step 3 : Scatter + //cudaMemcpy(&hst, &dev_idata[2],sizeof(int), cudaMemcpyDeviceToHost); + //std::cout << hst; + cudaMemcpy(&last, &(dev_boolb[num - 1]), sizeof(int), cudaMemcpyDeviceToHost); + cudaMalloc((void**)&dev_odata, last*sizeof(int)); + + Common::kernScatter <<<1, num >> >(last, dev_odata, dev_idata, dev_bool, dev_boolb); + + cudaMemcpy(odata, dev_odata, last*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_bool); + cudaFree(dev_boolb); + cudaFree(dev_odata); + + cudaFree(dev_idata); + + + printf("3.2\n"); + return last; } } From b913fc07ce11204dccd5a0b2c845530bf1345782 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:25:01 -0400 Subject: [PATCH 04/13] efficient --- stream_compaction/efficient.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..4769a7d 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -3,7 +3,7 @@ namespace StreamCompaction { namespace Efficient { void scan(int n, int *odata, const int *idata); - + void init(int n,const int*b); int compact(int n, int *odata, const int *idata); } } From 904b9aa4e201a99d580ea11e719e9c465ccad67f Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:25:22 -0400 Subject: [PATCH 05/13] naive --- stream_compaction/naive.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..0cdba88 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -3,5 +3,8 @@ namespace StreamCompaction { namespace Naive { void scan(int n, int *odata, const int *idata); + void init(int *hst_idata,int *hst_odata,int n); + //int *dev_A; + //int *dev_B; } } From e5566e8b7c68c7d1ab059c28e0c2cdaa7d82e974 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:25:26 -0400 Subject: [PATCH 06/13] naive --- stream_compaction/naive.cu | 67 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..83f3742 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,7 +2,10 @@ #include #include "common.h" #include "naive.h" - +#include +int *dev_A; +int *dev_B; +int *dev_C; namespace StreamCompaction { namespace Naive { @@ -11,9 +14,69 @@ namespace Naive { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ +__global__ void Nscan(int n, int logn, int *Ain, int*Bout,int *Ctemp){//in;out;temp + + int thid = threadIdx.x; + int offset; + Bout[0] = 0; + Ctemp[0] = 0; + for (int j = 0; j < n-1; j++){ + Ctemp[j + 1] = Ain[j]; + } + + //Ctemp[thid] = (thid > 0) ? Ain[thid - 1] : 0; + for (int d = 1; d <= logn; d++){ + offset = 2; + + if (d == 1)offset = 1; + if (d == 2)offset = 2; + else + for (int i = 1; i < d-1; i++){ + offset *= 2; + } + if (thid >= offset)//pow(2,d-1){d=1,off=1}{d=2,off=2}{d=3,off=4}off=pow(2,d-1){d=4,offset=8} + Ctemp[thid] += Ctemp[thid - offset]; + } + Bout[0] = 0; + Bout[thid] = Ctemp[thid]; + + } +void init(int *hst_A, int *hst_B,int n){ + + int _size = n *sizeof(int); + cudaMalloc((void**)&dev_A, _size); + cudaMemcpy(dev_A, hst_A, _size, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_B, _size); + cudaMemcpy(dev_B, hst_B, _size, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_C, _size); + } void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + int num; + if (n % 2 != 0) + { + num = ilog2ceil(n); + num = pow(2, num); + } + else num = n; + int *_idata=new int[num]; + for (int i = 0; i < num; i++){ + _idata[i] = idata[i]; + } + init(_idata, odata, num); + + //std::cout << ilog2ceil(4) << ilog2ceil(5);//2,3; + int logn = ilog2ceil(num); + Nscan <<< 1, num >> >(num,logn,dev_A, dev_B,dev_C); + + cudaMemcpy(odata, dev_B, num* sizeof(int), cudaMemcpyDeviceToHost);//destination,source, + cudaFree(dev_A); + cudaFree(dev_B); + cudaFree(dev_C); + + printf("2.1"); } } From 45d1f2ed0fe5d5c8b3d8d1946c8d8da1a982d4a6 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:25:51 -0400 Subject: [PATCH 07/13] thrust function --- stream_compaction/thrust.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..ee4c69c 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,7 +16,24 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector dv_in,dv_out; + for (int i = 0; i < n; i++){ + dv_in.push_back(idata[i]); + dv_out.push_back(0); + } + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::copy(dv_out.begin(), dv_out.end(), odata); + + + printf("4.1"); + + } + + } + } + From f58a2efb25dfc143508fcff3ae1a83576a4ccdb8 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 19:26:18 -0400 Subject: [PATCH 08/13] cpu function --- stream_compaction/cpu.cu | 119 ++++++++++++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 27 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..2c9454f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,36 +1,101 @@ #include #include "cpu.h" - +#include +#include namespace StreamCompaction { -namespace CPU { + namespace CPU { -/** - * CPU scan (prefix sum). - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + /** + * CPU scan (prefix sum). + */ + void scan(int n, int *odata, const int *idata) {//b,a + // TODO + odata[0] = 0; + if (n > 1){ + for (int i = 1; i < n; i++){ + odata[i] = idata[i - 1] + odata[i - 1]; + } + } + std::cout << "1.1"; -/** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ -int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; -} + } -/** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ -int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; -} + /** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithoutScan(int n, int *odata, const int *idata) { + // TODO + //int *p = odata; + + int count = 0; + for (int i = 0; i < n; i++){ + if (idata[i] != 0){ + odata[count] = idata[i]; + count++; + } + + } + std::cout << "1.2"< Date: Sun, 13 Sep 2015 22:40:40 -0400 Subject: [PATCH 09/13] readme --- README.md | 278 +++++++++++++++--------------------------------------- 1 file changed, 75 insertions(+), 203 deletions(-) diff --git a/README.md b/README.md index a82ea0f..e7440e0 100644 --- a/README.md +++ b/README.md @@ -3,211 +3,83 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xinyue Zhu +* Tested on: Windows 10, i5-5200U @ 2.20GHz 8GB, GTX 960M -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. +### README +The results are marked by the number of requirements. ### Questions -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) - -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. - -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. - -These questions should help guide you in performance analysis on future -assignments, as well. - -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. +* GPU Scan implementations (not included in code)(Naive, Work-Efficient, and + Thrust) to the serial CPU version of Scan. + GPU/CPU method: + | Method | time1 (ms) |time2 (ms)| + |CPUscan|0|0| + | Thrust: | 0.002304 |0.002304| + | Naive | 0.002304 |0.002304| + |Work-Efficient compact|0.007|0.007| + |Work-Efficient scan|0.006|0.007| + (time1 is power of two, time2 is non power of two) + ![](graph.png) +*the time line of thrust is seperate. It runs evey few seconds. + +* The CudaMalloc function take over 167439 us. Over half of the time.This I/O is bottlenenecks. + The I/O time of deferent method is different. Because some methods needs to allocate more places in device. + + +*Output: + +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ] +==== cpu scan, power-of-two ==== +1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] +==== cpu scan, non-power-of-two ==== +1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + passed +==== naive scan, power-of-two ==== +2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + passed +==== naive scan, non-power-of-two ==== +2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + passed +==== work-efficient scan, power-of-two ==== +3.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + passed +==== work-efficient scan, non-power-of-two ==== +3.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + passed +==== thrust scan, power-of-two ==== +4.1 passed +==== thrust scan, non-power-of-two ==== +4.1 passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== +1.2 + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== +1.2 + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ] + passed +==== cpu compact with scan ==== +1.3 + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== +3.2 + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +==== work-efficient compact, non-power-of-two ==== +3.2 + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ] + passed -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. From a6c703f1fc03d02133d14a0a1f2a540e7398008f Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 22:42:15 -0400 Subject: [PATCH 10/13] graph --- graph.png | Bin 0 -> 21825 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 graph.png diff --git a/graph.png b/graph.png new file mode 100644 index 0000000000000000000000000000000000000000..acf2667943314257f3971904ef00f6e02b4d9448 GIT binary patch literal 21825 zcmd?RcUV)~`YsxZ!a@|V0D^$ZQj{tppmdRqs&pEu zL3{#XQNLThmqsN`k#05Y`fYqjN9>BF{Cy>XO7$jte{W0h0yMO%S{e!(1x75Ggx9Z>aiRo=pQ7VHS9115_*yu zOcNxM^C|-baz_35|ECuy-@Uh^8(#P=jlI!(d(kk^48qIlu1kVI+DKnS(ys5$Ol&8w zwIH@z=GyFbe5w{G^C*39_-b;WjXyrV+6!-apTl)}b9r*EB8}z|xKP&8+unY?%Hz(R zKRt&_Gbpsvr%$tyZc$SaJ6pXNNMk71RR+^{U%q@LR zX_d>3AS!Mm)vJDM2D-DoCE6XlmL*iOBn)vqE5P0(8n`fAM!Z*fXSbV!W)L`UHKD+x zuffZ`h1# zFn5!8{qd$NIF~IuV}!&_Y`|-)th_jg`2(VS0(8jWUWdz4F12Q7Z~}e0|4!NV6(Nd& zs(m14lP0 zc>f7JG(D9g_$!mEo15t3I2J>+JZAG^Qf9_oeA0JkHfG1A4_QgGONqH5v)viQ3XP$KO`GpBLKp9*&yyU_He9g@c5Ya=YA^|LF0(5``Hu$> z>PSaIP4A1ms+N~+vGgg-PE9^TK5kIwxki~v>U2qUsZW|%e&ks&Z?VCs%ZY$YT4Zs>WYsk zJ(8Q3SF}FKEj%n&eWCuxhkXOItQZwbm@BfuPPwkhpVro}-R4`86hJN0i;H*>&Er1T zPMq+!p2w9qrv~JrV@S2j{_WK9MRM;V$$

I3{~yGkW4R8P`(ZEZb!$$KnJrRS7$6 zzo#|q&tj_@5IcNz;vd8T;_n+dDJ4fbd)D^4+# zeBTY<$0K>OEs=tSQWX1}zC=>I)2wtI?1W?a?DyGEM;&^7nLZUhv$Lj95<6EpQQ{7l z!bvra(o4acKOTIOX4J;Ui!(09(1v47rhRyAX4g$K{@Zf_@r5RrZ)!@Vr_ja8N1R;!uq zs%5ZHS&37%CoW;BmXMl@YO1GwL~L>p#uX9Xx-?q_ngEdxK^=w%BS4faXl>eU09&(_ z8)5aajg7u!k0QhOUJ3ljKWag5W2*Aekd%eq;cCW$vSsiiu zuJBna0Fvf1;91qx@$C8YYIf+Qi%sb&Sa!pn*5NaZV?9~g1PRRU4wWZlqG>M=aU*DM z!=JSTPTfQ@nA~qDX*FndG@p72w-qOMNnZTfQ2W?DZpt!-J*PyO*DEdc7W@i zfGd>Xamr5UHHym5TuUY1kjG!B5uY*KUzR4V)_2>+T#+m5+hZl-l96U<2foBXEnN>@ zqbETi_b&KDLY?ScBR)ZFJ}IKIChRVx(JB(=HD|XPSyT*etRE!4pPKse>PDM}c`IIe zf-+dM;g*OyXx#j1EG>Fv=Lv~gPf8AY%i`iTXj#+W=ucg&F(Y$Pf=<;$Hk0{R(#Kg`Qbwswqt*tLc-<2k;%tsPeow+h(B-tH*=lfd8T|UU z7gjo1*O{Ncet2F(%tOj*aFpvZ@uASRJy?|$p9r6g2Q;GWLehM`Mut5hO4M`Zp8O%9 z+1|zXP*wB(GlL1kVs8fL<$l7CGvs5I24J z#6?EG3R>jd`y?#6(~3JoTHTB}KZLqra{27uz{oD{`4+;{7W`Agr;|ZZ$GJ0hoDmz& z1oSxqnKi}F{kdc%7q0`RY=T31%>d?}lk6`0Yu`iEWavqa=i7{gcYv+}jCk|NaUV0> zJ5y=(&meQdI)Y#J=uk2^Lh{)(q)%N9aeHy5kJo3*gIeV zZEXqSWR^xq=;c?0v4w%B)zB6aL`|L;AQLz^brfQK1z=b;F2=I} z8fk*H0D&9^dHCF6cBl;R_5sM+OG7^fUbcoG`x*=ad9K1)K*HVkulGVA29MPKT_gXE z3qK(e1X9!na41YFmJtH!Hk+KB9KcFqe)r@*HsWfRt_vdEH5nNh-!6cAQUH<~edQBX z;tLQ=rT0XAfTQ2S@QZ5iiScCuK!8TfMzbF}QxyAVI+FXwu|Y)y?dQj?J>d{S3Q8T; z!JVxcR=D&`@A~y@8;qYxTa1uu>T>}_Q)O-)1_&=VdO`ik$>WcY@m^v6_~OM2D8O2$ z;@zE7DxAAa$yU`KZ>&VrU5bspP4}SzSjF86CZgn;R%luQ&4EKw;vqPNh1?JzjbF^ncJa~^f^>E?X8Tv#BS znbB%&0cHa2dylCsyP%HmS%DZAw8WMjS%eVv;6h6Omo+WQvO<=tHU?V&f&B{AWF)v}?k z?3$2?qF#*1xYhkG`+z^PyTely{c}5#<@2yXWYaEC7Q8$lq++dXdnT^h*sBb-vf_k$ zuEutSV5Oe}UCKi~>WOK$+xd`aL?Jfl&GjyeX*)F}^tO4Qtw2E9X8HB)bQ+*j8EaHT z^>S+k%(pFV`vV%fvhQpNae--$G)@(Eg|0A47J?;E=tQKZ^~TK?6x_c}bV{*<2mbL5@HhUdIz zx98DLFg_k!+m~fB`xc8g#S3CrE?TNKY<}`^`d2vZb{B+>uS+AUxR1( z?Wy>NF3Li=&olbQ0paNkxg@Q zGK=|(`i*w^J_oyydbBcSfY9rZ=p>{nO;b;TQ<_dvE8%NL<|wju`gZTuBHb?Fl|l{0 zlD&74DvL+g5<#emEHF8FK~U-bU9Z4|`^RjYe?}Zt#7^)iJ&J5;{0wInse4c7hKFUq z=BdtvqtOL6+$*CcXYm`VWaqTdjKkl`*^;@Gq%?=?4yvH`b`fkh{U*qSX?MGX@gL~< zK8dU$GqSh2Kdp{1g!PaM9pC-NQb=!OW@y)9bVpElq5M49HyN)j9d~zVCSS*DS93B( zV+57aQ$CAcij1T;GR}pK`>uE|3>i^gk#jDrmd&8FdGTu~Bg;vB?qVk?_`IU8#9J;E zUopaDUj=z$r*pcV)K}zfZWZ2!J9)Q$>2W1|>d4#;RD%!V6m)!hbOR|x!;p8bQbZcI zmeZV$KEb>eKF`mlPzji?6#&+SOar=A)x@d{u2Bvl#8&t#|{Q_<5O zB6;4gl#Nx%At&E{VOZM|joq+AWuJgp}+PKf3IDAGMFg5AIJufZxpM=ikAgj3KgaW>g+_rJo_72|i zNE{VoSiK=X@uSyX-OLLW5-*85mrrjH$<>!JRbVI>uu@yg-TAKRg6OFWiEo;!d^D3qNwQ7y>DRsZ4b8EZMm*o9F0%l zG|s7WrHK(NFACpp4prw;3c6&%c5b_$6)=X9d)Xox9@@H12P63m&OC6AC@{J$8t2wO zs-#L7D1DRGa#IwtNT;jVq6!QO4c%9_-QgJLc_@(pKGA z_we2Uc~0#H_&2Picyq{BE%dc(g0)Nae)$z>Y%q#d)vSo{Ju{ zmhOdz?PU*MWdNIY$bE*pv#@H_ZMeNl_j>$6;Xe96C@Skbv*s5b*819C#r@S1bnIMd zCUs#U{c=TL)~C-^(hoeDyJr2Y)Z5>WT$Ca#Sf>n``T8?9F{cP@Opvg6`$l1E!w)|OkDM?KgXl4`*f)y;M3^qS2E}!WW#9H;LWoTv1Bmo zV%MQ*9X9SlX>b+gm8pRMg?)PQM&1K&K|sGEn{FU351vMpk(D*UQ};n2SMtXTFX5^- zYKxfIlPGy*_dX%kEn5_RTcTx< z?0ur;J~h-<8zZFc%mYW}c2SoZWR)3;^h|Bky9ZVc=7JUg#qJ4(gUPV}BXi!vVQ7>B zXgi!YSj!`qAKJ2|3~ay6Ld{ZlAZlXZq2ed?AOlGyoG0cR%wEs32=*hecBP( zwAw0>Yse6K>v$|<;}K~Aom9S4x!2yfFW31*x!%-q8LiqN+N<}lu|YoS9S@gW9H|iL zXm4*0^XU39!wJ56B@MlT@?)1!bcnq8bEXRYl(% zX=i=E3C}>&RDsb7H#7|8q;!WO&gGF6+9Q`E#9wEZp z5+rOERoP$8=Hkvjdn{XFW2(RVn0)hOl`B5FqRloW1RK8^UQ`q3l=>RW(C8*DCpTC_ zK6o`_Ux%f77OaAY-Y|{Q6U{Gg%5*}a-;vk!A#a=?-W|!@In@5Rv&XFKamSF|AE~II zj<#NZ8|QvY2=AYuYc3fH-A*ORDRJ|6a6O`Ym!X%|R*QLyaJeMfE4M%KdY`tF*S9Vb zW(ai(b|R2-N&q_=K;uDiROb{>#1NrCAD5a5cu-a6A$=ujsaUNxH%0v8Td|*%vP_O+ zg9E%ES*ANrgaWUOE(_PZ5d|@s^CP4Bla<-(dsrv#7-3E?Ty;sHU`e8Mn#=E^?K3}*uhs^R=T~H%`X|^Y8s7`PQHpp_+ve+6k|A@5?MkSN%swL}6-`-y8M)<4+ zQ@RMLZj``BJ=EPi#Yct^-rJzi!_t+H!WOWdnZ9jCNae|-Aw)x#_UW5-wT!P3E4Q8} z_imKb(E1vJpY>|r2kr`8cY~x;5Gd>gvY+W2@m&b#q;N19uQFI|8Efm%v2}bOH*$TsWLLj_aV3E##Z;EpL_;~-2c#PxnsL@|n=-kNv zP#?YW6jJmnMmvIO*5{Y^vc5Y3*p66MP-o7nDQrxsFQiReqBfdFYRN@tDweo+{e z7V}Ob(`dXO3Paog`vyR}uFcK+FjV+qg?_it3{$3qFnUdx2B;Ub-I+?jex%bw3m|R^ zmN*2JF4l+(YMbFaan&YWU1_Rz9U_)?c|WO`F2Dn+c&FV??PblHIkg93|GKy~SX1fg z4yJbGu(>&NC$C06Ql;Af9CInj7|4NzCdUW< zf3TG32>CB5^#A3WKexCK>iU1DDUh!`2{4h-1zZwAcz-GAC*Lk!qz;_hm z@4deL`GxS@k8d9bCd1WT6{i(OHr_vE0X6n)-4DPgepCnil>zgoSSE+3319wZ+(2{` z(R+Ko^0R)h_#MVV*%w1G21X@Divblit29-m73KN$5 z5;`jbR%hrUIx&Fx)5j+qW-&Z$Ig4>t9w~=J^s@W%^z*W`V>)qOkIXX;gMrQb*MOPx zD`3)@(cb2-M;WTeGwDlN?f6e}N*ajy-;$=w#BHr~YQ1XqgDY3Tl^go{HNl=4rmcRh zu!_aWmY6&l=gw?E+$1=Hn8{iD_%NE&#)Uun`SXozz#?m~Dg#z~Y?U>ur>e&67gNLl zIcNQAb%G$BVc|ZlPDc`8{Vn2Z%xc8Q*3-bL@lG^(O_VRO&_~YQ|myQq$~Co_)9G*Ixv%c8nl)X8pBLQnJgB8A16IO@xZ5~pzo~=qhAnYOvspU=#C*v z`KarOJ?XD!GpoD9%r~ZzUO?AodsG&<)EcVHSl~+zExmU43C~p)IMx!mjUbT`^EnXn zF$cTan}Iy`7B2Lky8U=QuS43?_%(T}b3MnOA<)gutq=|DAP~rG(em;#(KgWr{Lpnj z_@VodlG)!seC;5WE9=rij>%n{X+ZnFA+x3QVao66o<+|5Tkx5@kMQ_-E%IsIjTtlKw{RNVF1)Z3-Laz*1X!ljfmB22(-U@uC!P z*$`#z?;bd``%V$N4wEXfWV78#Ar#>{5}gaZ(EbyveFF**u@_)uI}5lWdp~S`ipg8< zRb()D`g5@VZRICX-T6XTyO%!stMsG<+#5(076>HnWUM+5-Tx!!J^prM32&tw6MFw2 zH5|~SK=}iutj*Df*lpx@^;pO$7RLxVcOA%Z|0u}{EzjzwC{$(Bg`kW|K_z*wG4nmc zE{4%}f2PI%I+m2c@y_yeF!X4^MKgL3l16~>s$>Emrif4zuR8J=LIJ7)B8H)7+nWcx!0tBv1fUEvN z8ka9Bp}nl_EQOd0cqjp;?2ym_AQM9K2OwAe;(=@Ne^RAm$zzlL_|cAm&_)q43Ng-`Gk&(2`R5%=L_zmi<;s^reQb1O8gu2{LlEm z%&h{!1VhYiezWCZzt;oT9BS#8bM1e>yX-%d22ZPU{2`AAQk8>%W2}8_6_4@HE3v8p zi2{SG2X=`*V1vTJz3EsK4EimAWn2MFbtR2s2Udl(pJ)HhaE}-1$pU1!b_phs)#$kU z?=nCv8l*|%JUxj5(B|`Vd>OKqS|Bf((-jv@o{|0ha@G|}2~F<0bov0$-yWW?QCU5J{{YuNcBm1m5gtOvB^rENJ;^zH7W(b!#qn%trB zuln4jE58y~_=!W`WTwvX`Tav1WLrD}CFL-cwcW7WZRbhT+x>nhX5BvzLeT(E7K!~3 zTl>${)83DNqVuuPQUbqmmi^&7 zTQ}2rrm_WjeRwUJp;#$uC~SB7g?Dzt3Xc}0RI1yK@hp4BY$~W=uP!$K65P!{Z?!bo z1oA*0e;_n$S2lF5b^N8j^QzNHq3KGV?J_2aIi3=rlplWhal5)iMu(*yW}r*Pq1cwc z;m1|k2X_?fccW2yyKQ!~CC71LQ%o>t?tyF}_*~ZFP*@ELu^zuGH4rA=0GHQuIT3!M z&+SL1=|h3{$D=N7j2;1Ax>JTMARk_F{+FoQOpmHYm_0TTe@x&m9mh;2nC1F=o`20! zTAt5z`FMCn#l6YqO2qCCokpt!`;f>TL*KyIy&q4X4o0kcqTIFuk3FTOG^{j3@on5f z;Ro%9>GJ~8)fFIEFdLZvVH@;ubZwQile*Re7|59Vwd?2~@yfY_!Cl>(|GFPn)gz1bcPmo)|hNs7~M*IzGhU=R*Kp9cgQigpeyX3&V zy3>49_eAy4n%NnCi6SheE-FM2Vt%$VmXSSn_B`K^baSQ3y{L3!4pFWZpT;lq8l@3m zhlfL;Wu6j$GP-)mBu0~?O5-Q8QfQwU7J$9W7l*z3P5sE&ZK{PW0?g}bwLqA-cMueM z0T9xJq%NKl=TUv4byXE8F7`>9x-H6rcmKT*S7s_Fxy(!4ib(Jz$OCp+>-ZktbL$C> zm?d-(FrWO%XwKXgXdZd)h{RJs`x0HOq6)fT{3J6(j2}5m`dQN?VEKZ^%gt3=r`3)E z$={UYwc%+-m&y7G|A~+&_;bC6ik#Vv)eZMuh><5*~fq zqHb`h8a;Q?lHR891JflVe!8br@?(z=|3lU$s-`(+wNsHW_(dYEcHg2pk%}YAz^Ey( zRRI_^07JXMU_bmvHl)blQFasZ)MB{HdQ>Pk-sprw7kTEHe6>PO1yH>PrSvS zn<|8lT&+T`OE%--Kf8I(5rVLOPBVXGzB58bX3Aj z*}N-x>y8)L5dT71z7|2c_S^74W?6vR_HXU*bTE-F*%iPPN0<;UO)6kwpg7oDUNzYU z=%=Vntt$30fQP_;USz3qX=>tRDPI9o*Z^{3e>f9h;Q+z%QwaUHz8UGeSqqX+RNrl` zug&dlRITL!**|axi^8bT8vQ3;@AL}D9~qYcy&7=WWa)Lb+xDZZ>%%XlQ2J0^Rw^`L z6hKfu^bIW1Nbh8^8;_^A_<+s+1u&>6d5`>d#uT-*#0crPJQu*^{&b595?~v@D&Aoz zb={MZy=axtl?QAM9ag3&4hB;+@Vj&djW29+m?j;;JJwk6mQepnClYNzLcm)fDc;Or zQr}e0`VwY1GBqUcF>GL>bJ$`R&Kl89vlu*wzIiXj zt+IS4O~jT+;lJer1eu~m&=fv_g;y>Lrk(s|&Qppz^m2uPa^8yv@m}dnb6@t^iDCD7 zVt&LEW(2i!4mz)6mcMZz-l0+cBM|fbNdw7xoeV{q6%-_H@*3}HCcGm#XwW82g*@%7 zL(;OkZHF-tGC7ZSI;H^4=fNry!GncQSc~4PvGl%vIla3{YbhLT z@kTh`#Vl&^(vq#bQ-a1_5xZVb*6N{iMz97rjq<#Wkv+1Bs4BtSs8eI0OvpYah6g`i zM+F3+cTu!jWE;X|<%OGDV))p-HnBd{%^uP<;QP2SMCv=k(#z}VU$FYnYqYWetqK@; zSr*S)_tLF`&|5Et4M!dI$mrEKNqmaH=IerO670d-@~r18=l4g}?RFeu_@Bl-m-Eq&EL#EoDWrmsaGMI* ztYIzoBC6R}Ss?>UQ|0HCmCc|(>#e=Ot@CdjrvILdDXU&PaG~{38KdJCXwQkL7P^g@Bte362`qVfOiM|C|x_jdy8wm6m_gs>*|u``&q zZ%wN3i`dPPFKZ>*U8Y?_p;Uj|2^VoQXIj();b> zlfJ)fdmDn-bDybup^3sG+nBN?vsSua> zp5Jai8FKJ%Fd{O|Z6zX@l|t?pp9H+<*S+?!oz zJM?nMZXJrc=iKLwg^^cU+S&w0Tjgzu-*#6*R~jmH=%b_cIA={=@rXhho}!QffBx>- zZ8|J|_6)ja!yu;r_H88P_>jQ*j~atBo_TQt*SVnam;-BVaYp9#}`cuMmN{ARcxaiA%+v6oXrj*Vg0=<9@SjDT9D>Q%B4R!?52aR^V=Sd zS}*tN&J_P~3p_!g96a2SeyR-+m=)b-OvQ|bT^jlkfgzmAUor`w=KAsyZzA!Y+B8$| zxpvrBaCpZGbedIPX<5T8O;~SGljGo-Tj^F;8&;!r!JttQHzp__oO`FevF|)ctL;~I z^U*Mej&=FmD(^tuz=3K+tgT-kzk=j02xj+}8bw)}0(nL+rbja6IN9`Y9}24)Ul;F; zYYH<{0zte@ug)rGv?a-sGr6}|EyQ!z^`E}h3NCd4=h5#rFm2a_C)%56LNHm z*47={tUv-AGOPR9G&krmAk4Kb}1W-^)PUi_mvc8qc42B?iwPz;{16>(xa z>l-dnUxA^eAONL3CSZ=T3Zcl z%_&}r{)%ljgT%IZroapQPxDjAy*Ku|^cU+{)_5&5oWq?RnyVJ}(6TR1yW7?VIH%$f zHI2nOvo*E|ipj%0r}jhl&5c@-&d@W;(jdSgccWR;dj@I*uUl2ALGE;P7E29&ANH!% zs}abMIZ(;R^7W}{X#T{CM0L;o{j}R86H@NpSlOaieEj>+65vsjy-3!CXpym%sNO6wvyuWDkI2emK_%kg)-JcRermJPYC+Vp1E?*QkO zn-{kD`hgm0O0JdVPWLKH=r(ok`S});UqxFhWoT+kUi2e*)npM%+euTQZ3*IlOSB6+ zaEx^qJ8Rk$P38f{AHB%C4_GBB(>5hjjY(i`2-wV>Hj&kp5?iUkj}qc}VN2yf_}S!M z_njGUsLjsY?!0pmupu4{Bg?98xibj`^5fyu#Q|AkLyJ!%wB{xEo`ZbO13sq`IEwi= zdcSA-KjdSuD0(mZ9pacV2Lap~AoSG4iA8<~KB-r{kKztO0cS5cddS9@t;krVK5{ab zvTRrs)~j%qXUMY}v&j4O{D<2j4=o2{mPbd@oJ6SKLmmcPoqGbfaUvWn1(4w+8r(nP zerC70*E=w{g3K@!m<~&iDy^n|PKd?qn>4_cSuQfr^$YVVS^_2O&-FPj7`s`r^RZZdX*hN=q7$<8VJh+v|)1+ zswsGfBPPe&cC&x1@rt`QF0Oulqb6^DqT1oi$1~!2)7v$le>`opFiZ1LZW#CAjBaX` zsMb4nt|_A0iAYN17*1HR?Ql7vDRA|(NOO6!>x$2D52bt6=0G!aU060XBgq~V`No&$ zGPj0J>QUD*|CtT548_xqS%5O#B@GeP=$3nc*%Sk3=>@2hj?mwqq91(tFWu(gtb|&Y z5^29d!tXX_IRHBj#82tF64`(`1LV7D)tKnlf8cUu4N!aj=avDdL)))s;Cpshf+ zuel)>2kbUK@D$?K)}ZUbpxVUy;XLV@f7=TDceW}LB>hS~_m--D7w~z%D{aX(D$qOG zA0837Y8SlRL_a05)+7bLH@@PW79tlir)yW}^{x-uHF!{ebYJ1f0iagWm0wqa zqq~4gYET6(?t4pH;N(T`#v%^*-OFG^k@a%A>J~KKNzYDoOW^#*0H(B6lT-D0z`m9+Y7?N!U7>?jje}7@-kIE!?b^ARzP1b*9l5yN#cHl^kfz=&$clF0d4k@%(z21e1?_bYEwQTYAQYM>M z3~vo+HW^PiP0mB2zo*}EaypF%iP5+1WL?u`9h!KM{Pd`W$boD=D}~7^+ouX80}9}{ zs52~WepOFJJ1*kY6Y`jEm%82lQZ^0cL&~R!>z6NxYIH>aEmP_zrAsLp?|n1h2oADT zTZ>e4rQZJ@fmG3cTO1|$r@;rcnplV#$5-`e4_0f{Z!vB~fK1inx3x8w>6Q8O}{H zmB(5!@^!^&o>7L*8knUWG2hCRj5y>PjjuoXSrcCo=Y#jc@Tb%bF>xpv2xM!9P{Cp5 z4XA49H=l1G!in|En__A0k%!Eq*L5&J=-BW${Der7f zw()5tiXx4pO$SxCKdO3A0#$??y5E~x7u_T&ybz1gO_a2$@VY4&?Yt@6sHLm+Xh3{i z>cd^^Z47k@Q?%|Q#-x^&FU;LrsC`Y5E~(V-tsKbVFwE0T;mJ5Z6tGxt_O>o+5#w~2 zH2~6X>a*|8PvHg0B=0%ezqTHH^vW@znZR;9Ig%oydHMJ;uwQ@BzYFmE>^$~b`ovyz z_*&P_LYEFn9am2oKvf4zJg~34ujP+(S@+1j_cgkk7%F-IO?(*e$)Ys>5%HT$Ycqo- z%1R(KUMkSzf*WC!E*hg3#}>u?EsCU29J4(h$20xLb>(H#4J#DR27X-6w9dmlFFhJo z2LB)-72~c|;l#vMwQAcVmR-ic1adi(>-H5lShY*HI8^P3bJ6Y49M&ftjIQ<7756w} zOfv_n#`j+Y9ty$Ya6j!wO!@A9P9bI7Eq-m_sjMmY(g(~FHL;f7($TL14ZR2R`Yji* zM0`J_+zDn>Ve1Z{1QaK<^W#a8LwoZfujrv(LfKWKaD>!`Z9v_@mdH4{ZX3TzbbGRQ z{iPC%irh7%_!-g2*D6jY=oW{i@mmelQ6Fn*yU#aKFs(!64Zp>52fYj`dM6M z9`~mCyruEcdMC<7ft`+1+FaDk?d2e~7ET>jj_|zeD=@k!d zJUa7vjJFBezXJGV(6bXmxthc0l@6>Uuky0Y8R#zFwUPY{=1ECA^zi@G^3yPWM}LdpX@8@@6>}Z^v^OE} z$(cBNU za7=H(B?NeWHcJ-NC?$DD3S?um53|}cQiyN5XG7NQ@}P6~bsT=4$C?|lnz642@@*xc zz5-Bq#4c?PnD}m78KNJA1fKx`#FAOpM$VzX*fXa9<~Cr8=}<-h`_x}2B?%=%@01u% zmr^urOlyE(Pzi*C)Ho}b6zc}LzT!?s^8^3ZmiS74LEhL~E%b6_Ig{%@PI2X~B_MDW zHP0eXFuG2L^#CoLkf7;OVs?I6X$(MFWn7Gx1oQ`Q+6mDQ`NP~+<8$vPd<4a3UNJB3 z7@B_x<`GZKX93tip7hD`*zu zrd6r%Jkp1#win-ueO&pEvtThaH~k5Pop17cdw(9}?D>8lK$ZZM&AYo9XdcecTk8O! zn5$$Gsu2^bD|3FoG=)F$@(?VF2= z$y*a4>CU-}7*RS`+sG;egww`gbd`M0T9VLp*GW`1l$KV+-E_2gc8AZI;9I1|+#l@e z=wKdAFw8~5Z#WGOw{jlo0Hte@>BBL++@Nb*O;wzi`xd&NlXRqJHsP2=);3GPX*Pf+U3VuB2O-%fFRQO-jXTxsRKqfHXn2>u!EjhC3U#l3iSG`m`gRJ9HDnE9H zc zf;y9{E$c;FtJ<@XUC9%|D!v%}O`g$~O$o)V8${}1YRVO-9i?dM(eJwt&7~6Cs1;s# zbN~YoNT6OR+asA6|Kf^n=h-PtsNMj|-R*+1GB(=>@1Z#K&~k4m{3-g(j*5JRxd6cK zUey9EOy2;4aoZX{cV;i9ZRrpY^f*Sz>-t^F&l({Ph1d&#Y&a_YJb!L+rlm5juk7q)#(7E2sG=>DL z3Z~pm#5*34=c*GJ(kGGzuKo0{gS6Klc2Ar*CG*j~6cpq%9^*iwYQlKb#;5v0jZ?9h za;@2Ck$;-8T`Jc4`Wb(Y$Z~@Von&&pta#*2sVR#!m*iw@my zpQN@;+`{}6bO@n?5x;t9mgLzm;*uGVqSxR&o8wdxzqn0x$pbFe!Sj%~Zcx7GKe;Lb z?3;ea=9?&z7WU@se#fKFTHHQ7<`Po3UD6xWXZfgScqqwbfAqN{@#+o}yMg8%HUWuO zpi}UV2^7|+ab++juQIpBPV*RR z6YP=n1FuJ&6pO-Rrjz3Ex9{ERn$L>9aB{}pwh+1EudNwp#kVT^IH!_w9czg=Q9A%V zOByMf$Tp6~#Y|MdvG=l?>TCQO_RZb|bu{06gBLxWTcg}ShYH4r4TgRmhX(%|Cxf_PQ8ACc!+)`|jw&vbCmziY8 zqdu#jxbRxyCW-BPcyog7Q|vw0vl4ufqh@0_g8U@nOi%uaad*q$>@!EiY!6-1PfZNy z#3^A2)Uc1KA$~KLR^KXjz1`|+m+9)inRDLdk>Par1sfY|SA8wx3LWr@45PVTtmX;oJt5Th9qk&YKaSS!~;V^;Z{EFx%57ya0gMgqRD4r@K|$Nu0^;P zFN$6Y<$9K-i0?>fqeE=cOrGsX=kQ4ADRxKp;*A+BsI`(IQS5UH~ z4RYx86@7DK5VN}9+DeFdnt<)fpua?j-g<3CYH*5-BjM`R`tA^!^N{1*Aq{Q<2X+-L zTZxUa!5P$QoK24RDb{|IijnK^=SpyO{lh7v&AgT+rYjXaE*fG?oVtS{<__Pina{Q7 zGcmAZ-~$igb*@ap<@I^=6Xg(yTU02}-XIXpJq%|Jn9$EF4)lRz{lHIkk`7BC5RJ=p zwj2U^`rse;(f>Zq#z=Q;L(~rQ-{v{q;q{5_xMZg$z?nsVjNA79MkPUBOT zw;ukXCQvdT+W(;Eg-DlB-cGJ(L$aNpH+FTH_7(GrOUFz|bANMqczE`Tv7>3VvOsA< zL6S9_7E@c13 zfCm_mZ;1M*X+8c!D~yP=#U`~QOG6Ed!=RJR(Ma%uhTVnsH!F^@YB8`UF={L`T^X<_ z!~pei#H8MfjCCJEiaudvw=U7XnkWQc7_WQxb!nS6u{5c#vgjgY@!n-L1`d}GfPN2` z^15rcOU$S4)QF7DEK3#ZJzNi3^!y~LEN~NR3Xeey-mHLl`r1^VQ;0t&$XOaJWFyUg+1s! zuh|ltdUA4CBD=0JFvIt35sxh{va=#K_=M{Wf$%ErdMyj*l?8=!QKRN=Og%l%(M2$| zz~#hUAB2nYQ=4b%Xg!mxVrO{u?D)2=j4S)u`-Uj22_z~iDy`tc;rkKmmKNQUEG2AC z7fy9>*p@%2ue4>(%52n`Ha^<>9y;hZA0gv+9lj-d$~2%tw58a72ly+l396nM2uhzf zTU0hM$e*k_E0*zp)p4dxO_5ElYXFoc+>kWK`?xmmVLlVy#t5ze8Dd|`D{%4n%QDvjzHFx>6|lo$*4R>B5s9na48A%&ClMJ=g#9sU+>>Qknpv($P0izW_dI3Y zo&sBb#EN^N?CIk8vVP3L%zc3M;c7Z4+0F&|&6&tH<(6Q-va#m(kdkN}_z2S6KyJ<21LB#VGdJ@QOT4 zm4BDfhAxiWz((qu@H0O3$kfO6a&30n^|iCKBh|+kDoZJIS1F4=s;6*B*jlGLzq$*0 z!{6Bb$@Javfm*y_dATVt)WMme#QusF<6=e#YI~9Ad=8({uOq*rjTkZ=3H!-^)3Q3g zy^nDUk(hD7p8Fua(S0!X)6!gz#Tyh{Ld}2fHizcbnCTb>_StnRY#t@lDz=i0$6)d1 zAwE_9Xy01Dknm|R#Giav$0)z@`pe27+R9`%G04(kY!cRtQ)Vzj4W4i7XbFl3*j`T< zN!~L4TE-(Y-!0mbXChM)$~N<|CJs;5FnPjma?+i51LC(o%lMRoGIJIC#z&TtlGUw{ zOY0pvQ@Jw_uD8ze3oewVBSeApHh-AcC~guSBh^%csl)mFgt&-cXA-D2uAHMp3rbaZ z9>m)g-F>|4V@Jd_c3_i(rvF1+n8>gfS;GTGa-+ufwa3O|0y54ebHB0Xda&AMWx%st&g~M0~zBi12ObQ zt3Sl)mp|^8x4w{bQtbeA%%kDnv_0~)TZ1MXU9GFlysy+UB7JLr;vNP?AP~F{2Cr07 zAvFnsZ4#wuN$HwoQ!Mm*!sEqU*;2<%S%YQ=$-p1}l!d3pF^d&^+*&4M7OVpi;5VyG~g_GD?Vy@=fj zFbW3sXuUq*?$tHfIO|dQxoX|izC`yT`W7nv<$5@)e%ECzw`4KV)k@LU$^av`v$x8<~s{lVfngny#0y3ROL$27E&Dul6aymV|EA|n2+bST+ z7uJm_GpZ&dC-(cTi$ literal 0 HcmV?d00001 From b17ddf03529787e4449c23fc3d8ae2e7494afc0d Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 22:44:44 -0400 Subject: [PATCH 11/13] readme --- README.md | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e7440e0..2d72bb5 100644 --- a/README.md +++ b/README.md @@ -13,14 +13,6 @@ The results are marked by the number of requirements. * GPU Scan implementations (not included in code)(Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. - GPU/CPU method: - | Method | time1 (ms) |time2 (ms)| - |CPUscan|0|0| - | Thrust: | 0.002304 |0.002304| - | Naive | 0.002304 |0.002304| - |Work-Efficient compact|0.007|0.007| - |Work-Efficient scan|0.006|0.007| - (time1 is power of two, time2 is non power of two) ![](graph.png) *the time line of thrust is seperate. It runs evey few seconds. @@ -35,18 +27,18 @@ The results are marked by the number of requirements. **************** [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ] ==== cpu scan, power-of-two ==== -1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] ==== cpu scan, non-power-of-two ==== -1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + 1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] passed ==== naive scan, power-of-two ==== -2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] passed ==== naive scan, non-power-of-two ==== -2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] passed ==== work-efficient scan, power-of-two ==== -3.1 + 3.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] passed ==== work-efficient scan, non-power-of-two ==== From 8f614080685c3360567cf8ae608aae7052fdb518 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 23:20:36 -0400 Subject: [PATCH 12/13] readme --- README.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2d72bb5..b6752a3 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,14 @@ CUDA Stream Compaction ### README The results are marked by the number of requirements. + 1 : CPU Scan & Compaction + 2 : Naive Scan (naive.cu) + 3.1 : Work-Efficient Scan + 3.2 Compaction (efficient.cu) + 4 : Thrust Scan (thrust.cu) ### Questions +blocksize compare: to be honest,changing blocksize does not make much difference.. * GPU Scan implementations (not included in code)(Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. @@ -25,17 +31,22 @@ The results are marked by the number of requirements. **************** ** SCAN TESTS ** **************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ] ==== cpu scan, power-of-two ==== - 1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 1.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] ==== cpu scan, non-power-of-two ==== - 1.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + 1.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] passed ==== naive scan, power-of-two ==== - 2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 2.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] passed ==== naive scan, non-power-of-two ==== - 2.1 [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + 2.1 + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] passed ==== work-efficient scan, power-of-two ==== 3.1 From 01893bf3adabd160e4bb6b67fa9a0077a5d89f11 Mon Sep 17 00:00:00 2001 From: z <1090589429@qq.com> Date: Sun, 13 Sep 2015 23:46:51 -0400 Subject: [PATCH 13/13] bug fix --- stream_compaction/efficient.cu | 42 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2a19417..92e4752 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -65,18 +65,31 @@ void scan(int n, int *odata, const int *idata) { else num = n; int *_idata = new int[num]; init(num, idata); + float ms=0; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&ms, start, stop); for (int d = 0; d <= ilog2ceil(num) - 1; d++){ p1 = pow(2, d); p2 = pow(2, d + 1); - Uscan << <1, num >> >(p1, p2, dev_A1); + Uscan << <1, 512 >> >(p1, p2, dev_A1); } put0 << <1, 1 >> >(dev_A1, num); for (int d = ilog2ceil(num) - 1; d >= 0; d--){ p1 = pow(2, d); p2 = pow(2, d + 1); - Dscan << <1, num >> >(p1, p2, dev_A1); + Dscan << <1, 512 >> >(p1, p2, dev_A1); } + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&ms, start, stop); + printf("\t time of 3.1 efficient function1: %f ms\n", ms); cudaMemcpy(odata, dev_A1, num* sizeof(int), cudaMemcpyDeviceToHost);//destination,source, cudaFree(dev_A1); @@ -112,7 +125,7 @@ int compact(int n, int *odata, const int *idata) { cudaMalloc((void**)&dev_bool, _size); cudaMalloc((void**)&dev_boolb, _size); - + cudaMalloc((void**)&dev_odata, _size); cudaMalloc((void**)&dev_idata, _size); cudaMemcpy(dev_idata, idata, _size, cudaMemcpyHostToDevice); @@ -120,11 +133,13 @@ int compact(int n, int *odata, const int *idata) { int hst; int last; //step 1 + Common::kernMapToBoolean <<< 1, n >>>(n, dev_bool, dev_idata); - cudaMemcpy(dev_boolb, dev_bool, _size, cudaMemcpyDeviceToDevice); + Common::kernMapToBoolean << < 1, n >> >(n, dev_boolb, dev_idata);//back_up //cudaMemcpy(&hst, &dev_idata[6],sizeof(int), cudaMemcpyDeviceToHost); //std::cout << hst; //Step 2 + for (int d = 0; d <= ilog2ceil(num) - 1; d++){ p1 = pow(2, d); p2 = pow(2, d + 1); @@ -138,28 +153,23 @@ int compact(int n, int *odata, const int *idata) { p2 = pow(2, d + 1); Dscan <<<1, num >> >(p1, p2, dev_boolb); } + + + //???????????my dev_idata changed its value here...have no idea why. cudaMemcpy(dev_idata, idata, _size, cudaMemcpyHostToDevice); - //my dev_idata changed...have no idea why. - ////////////???????????????????????///////////// + ////???????????????????????///////////// //cudaMemcpy(&hst, &dev_idata[6], sizeof(int), cudaMemcpyDeviceToHost); //std::cout << hst << "ss2"; //Step 3 : Scatter //cudaMemcpy(&hst, &dev_idata[2],sizeof(int), cudaMemcpyDeviceToHost); //std::cout << hst; cudaMemcpy(&last, &(dev_boolb[num - 1]), sizeof(int), cudaMemcpyDeviceToHost); - cudaMalloc((void**)&dev_odata, last*sizeof(int)); - + //cudaMalloc((void**)&dev_odata, last*sizeof(int)); + Common::kernScatter <<<1, num >> >(last, dev_odata, dev_idata, dev_bool, dev_boolb); - + cudaMemcpy(odata, dev_odata, last*sizeof(int), cudaMemcpyDeviceToHost); - cudaFree(dev_bool); - cudaFree(dev_boolb); - cudaFree(dev_odata); - - cudaFree(dev_idata); - - printf("3.2\n"); return last; }