diff --git a/README.md b/README.md index b71c458..b4b1320 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,366 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Kaixiang Miao +* Tested on: Windows 7, i7-3630QM @ 2.40GHz 8GB, GTX 660M 2GB (Lenovo Y580 laptop, personal computer) -### (TODO: Your README) +## Screenshot -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +All tests passed. +![](./img/passed.jpg) + +## Performance Analysis + + +### Scan Comparison + +| length of input| cpu scan (ms) | naive scan (ms) | non-optimized efficient scan (ms) | optimized efficient scan (ms) |thrust scan (ms)| +|----------------|---------------|-----------------|-----------------------------------|-------------------------------|----------------| +| 2^10 | 0 | 0.0418 | 0.0882 | 0.0954 |0 | +| 2^13 | 0 | 0.0761 | 0.1681 | 0.1473 |0 | +| 2^16 | 0 | 0.4851 | 0.9168 | 0.3155 |0 | +| 2^19 | 2.0001 | 3.5169 | 7.7984 | 1.6069 |1 | +| 2^22 | 33.0019 | 31.4348 | 61.9076 | 11.3602 |3.002 | + +**non-optimized efficient scan:** + + __global__ void CudaUpSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = 1 << (d + 1); + if (!(thid % m)) + data[thid + m - 1] += data[thid + (m >> 1) - 1]; + } + + __global__ void CudaDownSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = 1 << (d + 1); + if (!(thid % m)) + { + int temp = data[thid + (m >> 1) - 1]; + data[thid + (m >> 1) - 1] = data[thid + m - 1]; + data[thid + m - 1] += temp; + } + } + +**optimized efficient scan:** + + __global__ void CudaUpSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; + } + + __global__ void CudaDownSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; + } + +### Block Size Comparison + +Comparing blocksize = 1024, 512, 256, 128, 64, it seems that *128 <= blocksize <= 512* performs better on my machine. + +![](./img/table.jpg) + +## Question + +- **Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.** + +See Performance Analysis. + +- **Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).** + +See Performance Analysis. + +As mentioned in GPU Gem 3, I guess thrust::exclusive_scan is using shared memory instread of global memory in my implementation. Accessing to shared memory in a block is more efficient than accessing to global memory. + +- **Write a brief explanation of the phenomena you see here.** + +The problem of non-optimized efficient scan is that lots of threads are doing nothing because we're examining if the index is divisible in the thread. The solution is to eliminate these threads. Actually we're able to access the useful indices of data directly by deducing some formulas using the index of thread. + +- **Paste the output of the test program into a triple-backtick block in your README.** + +The output of the test program: + + **************** + ** SCAN TESTS ** + **************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] + ==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 41108905 + 6 ] + ==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 41108897 + 4 ] + passed + ==== naive scan, power-of-two ==== + passed + ==== naive scan, non-power-of-two ==== + passed + ==== work-efficient scan, power-of-two ==== + passed + ==== work-efficient scan, non-power-of-two ==== + passed + ==== thrust scan, power-of-two ==== + passed + ==== thrust scan, non-power-of-two ==== + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] + ==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== work-efficient compact, power-of-two ==== + passed + ==== work-efficient compact, non-power-of-two ==== + passed + +## Extra Credits + +### Optimized efficient scan + +See performance analysis. + +### Radix sort + +Implemented in radix.h and radix.cu. + + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 256 + Time:0.893440 ms + Unsorted Array: + [ 129 65 3 2 164 225 233 65 163 238 5 32 39 ... 194 0 ] + Sorted Array: + [ 0 0 0 0 1 1 2 3 3 5 5 5 6 ... 253 254 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 512 + Time:1.707232 ms + Unsorted Array: + [ 359 307 257 255 382 173 233 269 365 470 220 4 277 ... 263 0 ] + Sorted Array: + [ 0 1 1 1 3 3 4 4 5 5 6 6 7 ... 510 510 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 1024 + Time:2.026848 ms + Unsorted Array: + [ 858 812 768 254 875 147 233 755 850 970 200 1013 780 ... 280 0 ] + Sorted Array: + [ 0 0 0 3 4 4 4 5 5 6 8 10 11 ... 1021 1022 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 2048 + Time:1.603968 ms + Unsorted Array: + [ 852 809 768 1277 866 134 233 1766 837 1988 190 2030 776 ... 6 0 ] + Sorted Array: + [ 0 0 0 2 3 6 6 6 8 8 9 10 12 ... 2046 2046 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 4096 + Time:2.233344 ms + Unsorted Array: + [ 849 2855 768 1277 2909 2175 233 1760 2878 4033 185 2027 774 ... 432 0 ] + Sorted Array: + [ 0 0 0 1 2 3 6 7 8 9 9 9 11 ... 4093 4094 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 8192 + Time:3.248928 ms + Unsorted Array: + [ 4943 6950 768 1277 2907 2172 233 1757 2875 4032 4278 6121 773 ... 5148 0 + ] + Sorted Array: + [ 0 0 0 1 1 3 3 5 5 7 7 7 8 ... 8189 8189 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 16384 + Time:3.312192 ms + Unsorted Array: + [ 13134 6950 768 1277 2906 10362 233 9947 11065 12223 4277 14312 8964 ... 53 + 62 0 ] + Sorted Array: + [ 0 1 1 1 1 2 3 4 5 5 6 6 7 ... 16382 16382 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 32768 + Time:4.561664 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 18084 0 ] + Sorted Array: + [ 0 0 0 1 1 1 2 4 5 7 7 7 8 ... 32764 32765 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 65536 + Time:6.343616 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 27145 0 ] + Sorted Array: + [ 0 0 1 1 1 1 1 1 2 2 2 4 4 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 131072 + Time:10.497824 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 12498 0 ] + Sorted Array: + [ 0 0 1 1 1 1 1 1 1 1 1 1 2 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 262144 + Time:17.427200 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 15972 0 ] + Sorted Array: + [ 0 0 0 0 0 1 1 1 1 1 1 1 1 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 524288 + Time:32.032894 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 22920 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 1048576 + Time:61.234879 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 4048 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 2097152 + Time:122.582626 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 31840 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 4194304 + Time:232.911423 ms + Unsorted Array: + [ 13138 17699 18632 25340 29604 30672 3346 23402 11192 6075 20512 14356 3110 + 2 ... 20531 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 8388608 + Time:438.904877 ms + Unsorted Array: + [ 13138 17699 18632 25340 29604 30672 3346 23402 11192 6075 20512 14356 3110 + 2 ... 17267 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 16777216 + Time:875.574402 ms + Unsorted Array: + [ 13144 6428 21592 7931 17465 5758 9571 17548 11447 26547 20216 14444 9844 . + .. 9561 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed diff --git a/img/passed.jpg b/img/passed.jpg new file mode 100644 index 0000000..ea271c7 Binary files /dev/null and b/img/passed.jpg differ diff --git a/img/table.jpg b/img/table.jpg new file mode 100644 index 0000000..91ee62d Binary files /dev/null and b/img/table.jpg differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..269cf98 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,113 +11,133 @@ #include #include #include +#include #include "testing_helpers.hpp" +#include +using namespace std; +#define maxlength (1 << 25) +int a[maxlength], b[maxlength], c[maxlength]; int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; - + int SIZE = 1 << 8; + int NPOT = SIZE - 3; + +for (int SIZE = 1 << 8; SIZE < maxlength; SIZE <<= 1) +{ + NPOT = SIZE - 3; // Scan tests printf("\n"); printf("****************\n"); - printf("** SCAN TESTS **\n"); + printf("** RADIX SORT TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, SIZE - 1); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + zeroArray(SIZE, b); + printDesc("Radix Sort"); + printf("Array Size = %d\n", SIZE); + StreamCompaction::Radix::RadixSort(SIZE, b, a); + printf("Unsorted Array:\n"); + printArray(SIZE, a, true); + printf("Sorted Array:\n"); + printArray(SIZE, b, true); + sort(a, a + SIZE); + printCmpResult(SIZE, b, a); + + //zeroArray(SIZE, b); + //printDesc("cpu scan, power-of-two"); + //StreamCompaction::CPU::scan(SIZE, b, a); + //printArray(SIZE, b, true); + + //zeroArray(SIZE, c); + //printDesc("cpu scan, non-power-of-two"); + //StreamCompaction::CPU::scan(NPOT, c, a); + //printArray(NPOT, b, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("naive scan, power-of-two"); + //StreamCompaction::Naive::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("naive scan, non-power-of-two"); + //StreamCompaction::Naive::scan(NPOT, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, power-of-two"); + //StreamCompaction::Efficient::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, non-power-of-two"); + //StreamCompaction::Efficient::scan(NPOT, c, a); + ////printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("thrust scan, power-of-two"); + //StreamCompaction::Thrust::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("thrust scan, non-power-of-two"); + //StreamCompaction::Thrust::scan(NPOT, c, a); + ////printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + //printf("\n"); + //printf("*****************************\n"); + //printf("** STREAM COMPACTION TESTS **\n"); + //printf("*****************************\n"); + + //// Compaction tests + + //genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + //a[SIZE - 1] = 0; + //printArray(SIZE, a, true); + + //int count, expectedCount, expectedNPOT; + + //zeroArray(SIZE, b); + //printDesc("cpu compact without scan, power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + //expectedCount = count; + //printArray(count, b, true); + //printCmpLenResult(count, expectedCount, b, b); + + //zeroArray(SIZE, c); + //printDesc("cpu compact without scan, non-power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + //expectedNPOT = count; //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + //printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + //zeroArray(SIZE, c); + //printDesc("cpu compact with scan"); + //count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + //printCmpLenResult(count, expectedCount, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient compact, power-of-two"); + //count = StreamCompaction::Efficient::compact(SIZE, c, a); + ////printArray(count, c, true); + //printCmpLenResult(count, expectedCount, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient compact, non-power-of-two"); + //count = StreamCompaction::Efficient::compact(NPOT, c, a); + ////printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); +} + + system("pause"); } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index f6b572f..adda3cb 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,7 +1,7 @@ #pragma once #include - +#include template int cmpArrays(int n, T *a, T *b) { for (int i = 0; i < n; i++) { @@ -40,7 +40,7 @@ void zeroArray(int n, int *a) { } void genArray(int n, int *a, int maxval) { - srand(0); + srand(time(NULL)); for (int i = 0; i < n; i++) { a[i] = rand() % maxval; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..be14bcb 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.cu" + "radix.h" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..c53b355 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,8 @@ #include +#include "stream_compaction\common.h" #include "cpu.h" +#include +#include namespace StreamCompaction { namespace CPU { @@ -9,7 +12,71 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + //int n = 8; + //int idata[8] ={0,1,2,3,4,5,6,7}; + //int odata[8]={0}; + if (n <= 0) + return; + + memcpy(odata, idata, n * sizeof(int)); + int nCeilLog = ilog2ceil(n); + + int nLength = 1 << nCeilLog; + + //for (int d = 0; d < nCeilLog; d++) + // for (int k = 0; k < nLength; k++) + // { + // int m = 1 << (d + 1); + // if (!(k % m)) + // odata[k + m - 1] += odata[k + (m >> 1) - 1]; + // } + + + for (int d = 0; d < nCeilLog; d++) + { + int addTimes = 1 << (nCeilLog - 1 - d); + for (int k = 0; k < addTimes; k++) + { + int m = (k + 1) * (1 << (d + 1)); + //printf("%d %d\n",m - 1, m - 1 - (1 << d)); + odata[m - 1] += odata[m - 1 - (1 << d)]; + } + } + + //odata[nLength - 1] = 0; + //for (int d = nCeilLog - 1; d >= 0; d--) + // for (int k = 0; k < nLength; k++) + // { + // int m = 1 << (d + 1); + // if (!(k % m)) + // { + // int index1 = k + (m >> 1) - 1; + // int index2 = k + m - 1; + // int temp = odata[index1]; + // odata[index1] = odata[index2]; + // odata[index2] += temp; + // } + // } + //for (int i = 0; i < 8; i++) + //printf("%d ", odata[i]); + //printf("\n"); + odata[nLength - 1] = 0; + for (int d = nCeilLog - 1; d >= 0; d--) + { + int addTimes = 1 << (nCeilLog - 1 - d); + for (int k = 0; k < addTimes; k++) + { + int m = (k + 1) * (1 << (d + 1)); + int index1 = m - 1 - (1 << d); + int index2 = m - 1; +// printf("%d %d\n", index1, index2); + int temp = odata[index1]; + odata[index1] = odata[index2]; + odata[index2] += temp; + } + } + //for (int i = 0; i < 8; i++) + // printf("%d ", odata[i]); } /** @@ -19,7 +86,15 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + int counter = 0; + for (int i = 0; i < n; i++) + { + if (idata[i]) + odata[counter++] = idata[i]; + } + return counter; } /** @@ -29,7 +104,27 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + int counter = 0; + + for (int i = 0; i < n; i++) + odata[i] = idata[i] ? 1 : 0; + auto start = std::chrono::system_clock::now(); + scan(n, odata, odata); + auto end = std::chrono::system_clock::now(); + //auto duration = std::chrono::duration_cast(end - start); + std::chrono::duration duration = end - start; + //std::cout << duration.count() << std::endl; + //FILE* fp = fopen("efficient.txt", "a+"); + //fprintf(fp, "%d %f\n", ilog2ceil(n), duration.count()); + //fclose(fp); + for (int i = 0; i < n - 1; i++) + { + if (odata[i] != odata[i + 1]) + odata[counter++] = idata[i]; + } + return counter; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..0ee0002 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -11,11 +11,148 @@ namespace Efficient { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + +int *dev_Data; +int *dev_Flag; +int *dev_ScanResult; +int *dev_OutputData; +int *dev_total; + +int threadPerBlock = 64; +int BlockNum; +//__global__ void CudaUpSweep(int d, int *data, int addTimes) +//{ +// int thid = (blockIdx.x * blockDim.x) + threadIdx.x; +// if (thid >= addTimes) +// return; +// int m = 1 << (d + 1); +// if (!(thid % m)) +// data[thid + m - 1] += data[thid + (m >> 1) - 1]; +//} +// +//__global__ void CudaDownSweep(int d, int *data, int addTimes) +//{ +// int thid = (blockIdx.x * blockDim.x) + threadIdx.x; +// if (thid >= addTimes) +// return; +// int m = 1 << (d + 1); +// if (!(thid % m)) +// { +// int temp = data[thid + (m >> 1) - 1]; +// data[thid + (m >> 1) - 1] = data[thid + m - 1]; +// data[thid + m - 1] += temp; +// } +//} +__global__ void CudaUpSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; +} + +__global__ void CudaDownSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; +} + void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // int n = 8; + //int idata[8] ={0,1,2,3,4,5,6,7}; + //int odata[8]; + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data, nLength * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data, idata, sizeof(int) * nLength, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + for (int i = 0; i < nCeilLog; i++) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_Data, addTimes); + } + + cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_Data, addTimes); + } + //for (int i = 0; i < nCeilLog; i++) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaUpSweep<<>>(i, dev_Data, nLength); + //} + + //cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); + //checkCUDAError("cudaMemset failed!"); + //for (int i = nCeilLog - 1; i >= 0; i--) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaDownSweep<<>>(i, dev_Data, nLength); + //} + + cudaMemcpy(odata, dev_Data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + // for (int j = 0; j < n; j++) + // printf("%d ", odata[j]); + //printf("\n"); + cudaFree(dev_Data); +} + +__global__ void CudaGetFlag(int *out, int *in, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out[thid] = in[thid] ? 1 : 0; +} + +__global__ void CudaGetResult(int *result, int *flag, int *scanResult, int *data, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + if (flag[thid]) + result[scanResult[thid]] = data[thid]; } +__global__ void CudaGetTotal(int *total, int *flag, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + if (flag[thid]) + { + total[0] = total[0] + total[1]; + total[1] = 100; + printf("%d %d %d\n", thid, flag[thid], total[0]); + } +} + +void test(int *buffer, int size) +{ + int *cao = new int[size]; + cudaMemcpy(cao, buffer, sizeof(int) * size, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + + for (int i = 0; i < size; i++) + printf("%d ", cao[i]); + printf("\n"); + delete [] cao; +} /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -27,7 +164,90 @@ void scan(int n, int *odata, const int *idata) { */ int compact(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data, nLength * sizeof(int)); + cudaMalloc((void**)&dev_ScanResult, nLength * sizeof(int)); + cudaMalloc((void**)&dev_Flag, nLength * sizeof(int)); + cudaMalloc((void**)&dev_OutputData, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data, idata, nLength * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + // dev_Flag is 0 or 1, calculate dev_Flag + BlockNum = nLength / threadPerBlock + 1; + CudaGetFlag<<>>(dev_Flag, dev_Data, nLength); + + + + + + + // now scan + cudaMemcpy(dev_ScanResult, dev_Flag, nLength * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy device to device failed!"); + +float time_elapsed=0; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); + + for (int i = 0; i < nCeilLog; i++) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_ScanResult, addTimes); + } + cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemcpy to device failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_ScanResult, addTimes); + } + //for (int i = 0; i < nCeilLog; i++) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaUpSweep<<>>(i, dev_ScanResult, nLength); + //} + //cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); + //checkCUDAError("cudaMemcpy to device failed!"); + //for (int i = nCeilLog - 1; i >= 0; i--) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaDownSweep<<>>(i, dev_ScanResult, nLength); + //} + cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +//FILE* fp = fopen("efficient.txt", "a+"); +//fprintf(fp, "%d %f\n", nCeilLog, time_elapsed); +//fclose(fp); + + BlockNum = n / threadPerBlock + 1; + CudaGetResult<<>>(dev_OutputData, dev_Flag, dev_ScanResult, dev_Data, n); + cudaMemcpy(odata, dev_OutputData, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + + int total, flag; + cudaMemcpy(&total, dev_ScanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&flag, dev_Flag + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy device to device failed!"); + + cudaFree(dev_Data); + cudaFree(dev_ScanResult); + cudaFree(dev_Flag); + cudaFree(dev_OutputData); + + return flag ? total + 1 : total; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..ca7b617 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,10 +11,65 @@ namespace Naive { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ +int threadPerBlock = 1024; +int BlockNum; + +int *dev_Data[2]; + +__global__ void CudaScan(int d, int *in, int *out, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + int m = 1 << (d - 1); + + if (thid >= m) + out[thid] = in[thid] + in[thid - m]; + else + out[thid] = in[thid]; + +} + void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data[0], nLength * sizeof(int)); + cudaMalloc((void**)&dev_Data[1], nLength * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data[0], idata, sizeof(int) * nLength, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + int nOutputIndex = 0; + float time_elapsed=0; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); + for (int i = 1; i <= nCeilLog; i++) + { + nOutputIndex ^= 1; + BlockNum = nLength / threadPerBlock + 1; + CudaScan<<>>(i, dev_Data[nOutputIndex ^ 1], dev_Data[nOutputIndex], nLength); + } + cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +//FILE* fp = fopen("efficient.txt", "a+"); +//fprintf(fp, "%d %f\n", nCeilLog, time_elapsed); +//fclose(fp); + odata[0] = 0; + cudaMemcpy(odata + 1, dev_Data[nOutputIndex], sizeof(int) * (n - 1), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + + cudaFree(dev_Data[0]); + cudaFree(dev_Data[1]); } + + } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..214f52d --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,184 @@ +#include +#include +#include "common.h" +#include "naive.h" + +namespace StreamCompaction { +namespace Radix { + +int *dev_Data[2]; +int *dev_bArray; +int *dev_eArray; +int *dev_fArray; +int *dev_tArray; +int *dev_dArray; + +__global__ void CudaUpSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; +} + +__global__ void CudaDownSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; +} + +__global__ void CudaGetBEArray(int pass, int *in, int *out1, int *out2, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out1[thid] = (in[thid] >> pass) & 1; + out2[thid] = out1[thid] ^ 1; +} + +__global__ void CudaGetTArray(int *in, int *e, int *out, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + int totalFalses = in[n - 1] + e[n - 1]; + if (thid >= n) + return; + out[thid] = thid - in[thid] + totalFalses; +} + +__global__ void CudaGetDArray(int *in1, int *in2, int *in3, int *out, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out[thid] = in1[thid] ? in2[thid] : in3[thid]; +} + +__global__ void CudaGetResult(int *in, int *out, int *data, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out[in[thid]] = data[thid]; +} + +void RadixSort(int n, int *odata, int *idata) +{ + //int n = 8; + //int idata[8] = {4,7,2,6,3,5,1,0}; + //int odata[8]; + + int threadPerBlock = 512; + int BlockNum; + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + int pout = 1; + cudaMalloc((void**)&dev_Data[0], n * sizeof(int)); + cudaMalloc((void**)&dev_Data[1], n * sizeof(int)); + cudaMalloc((void**)&dev_bArray, n * sizeof(int)); + cudaMalloc((void**)&dev_eArray, nLength * sizeof(int)); + cudaMalloc((void**)&dev_fArray, nLength * sizeof(int)); + cudaMalloc((void**)&dev_tArray, n * sizeof(int)); + cudaMalloc((void**)&dev_dArray, n * sizeof(int)); + cudaMemset(dev_eArray, 0, nLength * sizeof(int)); + cudaMemcpy(dev_Data[0], idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + checkCUDAError("cudaMalloc failed!"); + + int maxn = 0; + for (int i = 0; i < n; i++) + if (idata[i] > maxn) + maxn = idata[i]; +float time_elapsed = 0.0f; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); + int pass = 0; + while (1) + { + int pin = pout ^ 1; + if ((maxn >> pass) == 0) + break; + BlockNum = n / threadPerBlock + 1; + CudaGetBEArray<<>>(pass, dev_Data[pin], dev_bArray, dev_eArray, n); + cudaMemcpy(dev_fArray, dev_eArray, nLength * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy device to device failed!"); + + for (int i = 0; i < nCeilLog; i++) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_fArray, addTimes); + } + + cudaMemset(dev_fArray + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_fArray, addTimes); + } + +//printf("F:"); +//cudaMemcpy(odata, dev_fArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + + BlockNum = n / threadPerBlock + 1; + CudaGetTArray<<>>(dev_fArray, dev_eArray, dev_tArray, n); + +//printf("T:"); +//cudaMemcpy(odata, dev_tArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + + CudaGetDArray<<>>(dev_bArray, dev_tArray, dev_fArray, dev_dArray, n); + CudaGetResult<<>>(dev_dArray, dev_Data[pout], dev_Data[pin], n); +//printf("D:"); +//cudaMemcpy(odata, dev_dArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); +//printf("output:"); +//cudaMemcpy(odata, dev_Data[pout], sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + pass++; + pout ^= 1; + } +cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +printf("Time:%f ms\n", time_elapsed); + cudaMemcpy(odata, dev_Data[pout ^ 1], sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + //for (int i = 0; i < n; i++) + // printf("%d ", odata[i]); + //printf("\n"); + + cudaFree(dev_Data[0]); + cudaFree(dev_Data[1]); + cudaFree(dev_bArray); + cudaFree(dev_eArray); + cudaFree(dev_fArray); + cudaFree(dev_tArray); + cudaFree(dev_dArray); +} + + +} +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..c9cf412 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { +namespace Radix { + void RadixSort(int n, int *odata, int *idata); +} +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..f16aa6f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include namespace StreamCompaction { namespace Thrust { @@ -16,6 +17,21 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(odata, odata + n); + auto start = std::chrono::system_clock::now(); + //thrust::exclusive_scan(idata, idata + n, odata); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + auto end = std::chrono::system_clock::now(); + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); + + //auto duration = std::chrono::duration_cast(end - start); + auto duration = std::chrono::duration_cast(end-start).count(); + //std::cout << duration << std::endl; + FILE* fp = fopen("efficient.txt", "a+"); + fprintf(fp, "%d %I64d\n", ilog2ceil(n), duration); + fclose(fp); } }