From e249e876dc42b006db015de3ddd7d29bfcdc7c34 Mon Sep 17 00:00:00 2001 From: immiao Date: Fri, 23 Sep 2016 15:40:23 -0400 Subject: [PATCH 1/8] Finished cpu.cu --- src/main.cpp | 2 ++ stream_compaction/cpu.cu | 56 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..e5af2ba 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -120,4 +120,6 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + system("pause"); } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..0af1c03 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include "stream_compaction\common.h" #include "cpu.h" namespace StreamCompaction { @@ -9,7 +10,36 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + if (n <= 0) + return; + + memcpy(odata, idata, n * sizeof(int)); + int nCeilLog = ilog2ceil(n); + + int nLength = 1 << nCeilLog; + + for (int d = 0; d < nCeilLog; d++) + for (int k = 0; k < nLength; k++) + { + int m = 1 << (d + 1); + if (!(k % m)) + odata[k + m - 1] += odata[k + (m >> 1) - 1]; + } + + odata[nLength - 1] = 0; + for (int d = nCeilLog - 1; d >= 0; d--) + for (int k = 0; k < nLength; k++) + { + int m = 1 << (d + 1); + if (!(k % m)) + { + int index1 = k + (m >> 1) - 1; + int index2 = k + m - 1; + int temp = odata[index1]; + odata[index1] = odata[index2]; + odata[index2] += temp; + } + } } /** @@ -19,7 +49,15 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + int counter = 0; + for (int i = 0; i < n; i++) + { + if (idata[i]) + odata[counter++] = idata[i]; + } + return counter; } /** @@ -29,7 +67,19 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + int counter = 0; + + for (int i = 0; i < n; i++) + odata[i] = idata[i] ? 1 : 0; + scan(n, odata, odata); + for (int i = 0; i < n - 1; i++) + { + if (odata[i] != odata[i + 1]) + odata[counter++] = idata[i]; + } + return counter; } } From 1342e801bddff6b83c63e52f18adb28b0c308294 Mon Sep 17 00:00:00 2001 From: immiao Date: Fri, 23 Sep 2016 20:20:21 -0400 Subject: [PATCH 2/8] Finished efficient scan --- stream_compaction/efficient.cu | 54 ++++++++++++++++++++++++++++++++-- stream_compaction/naive.cu | 39 ++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..6dfc133 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -11,9 +11,59 @@ namespace Efficient { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + +int *dev_Data; + +__global__ void CudaUpSweep(int d, int *data) +{ + int thid = threadIdx.x; + int m = 1 << (d + 1); + if (!(thid % m)) + data[thid + m - 1] += data[thid + (m >> 1) - 1]; +} + +__global__ void CudaDownSweep(int d, int *data) +{ + int thid = threadIdx.x; + int m = 1 << (d + 1); + if (!(thid % m)) + { + int temp = data[thid + (m >> 1) - 1]; + data[thid + (m >> 1) - 1] = data[thid + m - 1]; + data[thid + m - 1] += temp; + } +} + void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // int n = 8; + //int idata[8] ={0,1,2,3,4,5,6,7}; + //int odata[8]; + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data, nLength * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data, idata, sizeof(int) * nLength, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + for (int i = 0; i < nCeilLog; i++) + CudaUpSweep<<<1, nLength>>>(i, dev_Data); + + cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemcpy to device failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + { + CudaDownSweep<<<1, nLength>>>(i, dev_Data); + //cudaMemcpy(odata, dev_Data, sizeof(int) * (nLength), cudaMemcpyDeviceToHost); + + } + + cudaMemcpy(odata, dev_Data, sizeof(int) * nLength, cudaMemcpyDeviceToHost); + // for (int j = 0; j < n; j++) + // printf("%d ", odata[j]); + //printf("\n"); + checkCUDAError("cudaMemcpy to host failed!"); } /** diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..7db8a2a 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,10 +11,45 @@ namespace Naive { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + +int *dev_Data[2]; + +__global__ void CudaScan(int d, int *in, int *out) +{ + int thid = threadIdx.x; + int m = 1 << (d - 1); + + if (thid >= m) + out[thid] = in[thid] + in[thid - m]; + else + out[thid] = in[thid]; + +} + void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data[0], nLength * sizeof(int)); + cudaMalloc((void**)&dev_Data[1], nLength * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data[0], idata, sizeof(int) * nLength, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + int nOutputIndex = 0; + for (int i = 1; i <= nCeilLog; i++) + { + nOutputIndex ^= 1; + CudaScan<<<1, nLength>>>(i, dev_Data[nOutputIndex ^ 1], dev_Data[nOutputIndex]); + } + odata[0] = 0; + cudaMemcpy(odata + 1, dev_Data[nOutputIndex], sizeof(int) * (nLength - 1), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); } + + } } From 8e12ccdf6c07293aeb978ceb987c5e74c0727aac Mon Sep 17 00:00:00 2001 From: immiao Date: Fri, 23 Sep 2016 21:49:10 -0400 Subject: [PATCH 3/8] Finished efficient.cu --- stream_compaction/efficient.cu | 95 +++++++++++++++++++++++++++++++--- stream_compaction/naive.cu | 5 +- 2 files changed, 92 insertions(+), 8 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 6dfc133..1d95bae 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -13,6 +13,10 @@ namespace Efficient { */ int *dev_Data; +int *dev_Flag; +int *dev_ScanResult; +int *dev_OutputData; +int *dev_total; __global__ void CudaUpSweep(int d, int *data) { @@ -51,21 +55,56 @@ void scan(int n, int *odata, const int *idata) { CudaUpSweep<<<1, nLength>>>(i, dev_Data); cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); - checkCUDAError("cudaMemcpy to device failed!"); + checkCUDAError("cudaMemset failed!"); for (int i = nCeilLog - 1; i >= 0; i--) { CudaDownSweep<<<1, nLength>>>(i, dev_Data); - //cudaMemcpy(odata, dev_Data, sizeof(int) * (nLength), cudaMemcpyDeviceToHost); - } - cudaMemcpy(odata, dev_Data, sizeof(int) * nLength, cudaMemcpyDeviceToHost); - // for (int j = 0; j < n; j++) + cudaMemcpy(odata, dev_Data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + // for (int j = 0; j < n; j++) // printf("%d ", odata[j]); //printf("\n"); - checkCUDAError("cudaMemcpy to host failed!"); + cudaFree(dev_Data); +} + +__global__ void CudaGetFlag(int *out, int *in) +{ + int thid = threadIdx.x; + out[thid] = in[thid] ? 1 : 0; +} + +__global__ void CudaGetResult(int *result, int *flag, int *scanResult, int *data) +{ + int thid = threadIdx.x; + if (flag[thid]) + result[scanResult[thid]] = data[thid]; +} + +__global__ void CudaGetTotal(int *total, int *flag) +{ + int thid = threadIdx.x; + + if (flag[thid]) + { + total[0] = total[0] + total[1]; + total[1] = 100; + printf("%d %d %d\n", thid, flag[thid], total[0]); + } } +void test(int *buffer, int size) +{ + int *cao = new int[size]; + cudaMemcpy(cao, buffer, sizeof(int) * size, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + + for (int i = 0; i < size; i++) + printf("%d ", cao[i]); + printf("\n"); + delete [] cao; +} /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -77,7 +116,49 @@ void scan(int n, int *odata, const int *idata) { */ int compact(int n, int *odata, const int *idata) { // TODO - return -1; + if (n <= 0) + return -1; + + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + + cudaMalloc((void**)&dev_Data, nLength * sizeof(int)); + cudaMalloc((void**)&dev_ScanResult, nLength * sizeof(int)); + cudaMalloc((void**)&dev_Flag, nLength * sizeof(int)); + cudaMalloc((void**)&dev_OutputData, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + + cudaMemcpy(dev_Data, idata, nLength * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + // dev_Flag is 0 or 1, calculate dev_Flag + CudaGetFlag<<<1, nLength>>>(dev_Flag, dev_Data); + + // now scan + cudaMemcpy(dev_ScanResult, dev_Flag, nLength * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy device to device failed!"); + for (int i = 0; i < nCeilLog; i++) + CudaUpSweep<<<1, nLength>>>(i, dev_ScanResult); + cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemcpy to device failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + CudaDownSweep<<<1, nLength>>>(i, dev_ScanResult); + + CudaGetResult<<<1, n>>>(dev_OutputData, dev_Flag, dev_ScanResult, dev_Data); + cudaMemcpy(odata, dev_OutputData, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + + int total, flag; + cudaMemcpy(&total, dev_ScanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&flag, dev_Flag + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy device to device failed!"); + + cudaFree(dev_Data); + cudaFree(dev_ScanResult); + cudaFree(dev_Flag); + cudaFree(dev_OutputData); + + return flag ? total + 1 : total; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 7db8a2a..f3009ba 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -45,8 +45,11 @@ void scan(int n, int *odata, const int *idata) { CudaScan<<<1, nLength>>>(i, dev_Data[nOutputIndex ^ 1], dev_Data[nOutputIndex]); } odata[0] = 0; - cudaMemcpy(odata + 1, dev_Data[nOutputIndex], sizeof(int) * (nLength - 1), cudaMemcpyDeviceToHost); + cudaMemcpy(odata + 1, dev_Data[nOutputIndex], sizeof(int) * (n - 1), cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy to host failed!"); + + cudaFree(dev_Data[0]); + cudaFree(dev_Data[1]); } From 2b45f4bdcd37ed11b7879fdf6cd4c978a9bbf958 Mon Sep 17 00:00:00 2001 From: immiao Date: Fri, 23 Sep 2016 22:08:05 -0400 Subject: [PATCH 4/8] Finished thrust.cu --- stream_compaction/thrust.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..4f20816 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -9,6 +9,8 @@ namespace StreamCompaction { namespace Thrust { +int *dev_Data; +int *dev_OutputData; /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -16,6 +18,7 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(idata, idata + n, odata); } } From 6e819c330f8409bcad8be03132879b39b8bf22ce Mon Sep 17 00:00:00 2001 From: immiao Date: Mon, 26 Sep 2016 21:00:09 -0400 Subject: [PATCH 5/8] Optimize upsweep and downsweep --- stream_compaction/cpu.cu | 63 ++++++++++++++++++++++++++-------- stream_compaction/efficient.cu | 52 ++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 0af1c03..aaf0ac8 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -10,6 +10,9 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO + //int n = 8; + //int idata[8] ={0,1,2,3,4,5,6,7}; + //int odata[8]={0}; if (n <= 0) return; @@ -18,28 +21,60 @@ void scan(int n, int *odata, const int *idata) { int nLength = 1 << nCeilLog; + //for (int d = 0; d < nCeilLog; d++) + // for (int k = 0; k < nLength; k++) + // { + // int m = 1 << (d + 1); + // if (!(k % m)) + // odata[k + m - 1] += odata[k + (m >> 1) - 1]; + // } + + for (int d = 0; d < nCeilLog; d++) - for (int k = 0; k < nLength; k++) + { + int addTimes = 1 << (nCeilLog - 1 - d); + for (int k = 0; k < addTimes; k++) { - int m = 1 << (d + 1); - if (!(k % m)) - odata[k + m - 1] += odata[k + (m >> 1) - 1]; + int m = (k + 1) * (1 << (d + 1)); + //printf("%d %d\n",m - 1, m - 1 - (1 << d)); + odata[m - 1] += odata[m - 1 - (1 << d)]; } + } + //odata[nLength - 1] = 0; + //for (int d = nCeilLog - 1; d >= 0; d--) + // for (int k = 0; k < nLength; k++) + // { + // int m = 1 << (d + 1); + // if (!(k % m)) + // { + // int index1 = k + (m >> 1) - 1; + // int index2 = k + m - 1; + // int temp = odata[index1]; + // odata[index1] = odata[index2]; + // odata[index2] += temp; + // } + // } + //for (int i = 0; i < 8; i++) + //printf("%d ", odata[i]); + //printf("\n"); odata[nLength - 1] = 0; for (int d = nCeilLog - 1; d >= 0; d--) - for (int k = 0; k < nLength; k++) + { + int addTimes = 1 << (nCeilLog - 1 - d); + for (int k = 0; k < addTimes; k++) { - int m = 1 << (d + 1); - if (!(k % m)) - { - int index1 = k + (m >> 1) - 1; - int index2 = k + m - 1; - int temp = odata[index1]; - odata[index1] = odata[index2]; - odata[index2] += temp; - } + int m = (k + 1) * (1 << (d + 1)); + int index1 = m - 1 - (1 << d); + int index2 = m - 1; +// printf("%d %d\n", index1, index2); + int temp = odata[index1]; + odata[index1] = odata[index2]; + odata[index2] += temp; } + } + //for (int i = 0; i < 8; i++) + // printf("%d ", odata[i]); } /** diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 1d95bae..89d78bd 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -18,24 +18,38 @@ int *dev_ScanResult; int *dev_OutputData; int *dev_total; +//__global__ void CudaUpSweep(int d, int *data) +//{ +// int thid = threadIdx.x; +// int m = 1 << (d + 1); +// if (!(thid % m)) +// data[thid + m - 1] += data[thid + (m >> 1) - 1]; +//} +// +//__global__ void CudaDownSweep(int d, int *data) +//{ +// int thid = threadIdx.x; +// int m = 1 << (d + 1); +// if (!(thid % m)) +// { +// int temp = data[thid + (m >> 1) - 1]; +// data[thid + (m >> 1) - 1] = data[thid + m - 1]; +// data[thid + m - 1] += temp; +// } +//} __global__ void CudaUpSweep(int d, int *data) { int thid = threadIdx.x; - int m = 1 << (d + 1); - if (!(thid % m)) - data[thid + m - 1] += data[thid + (m >> 1) - 1]; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; } __global__ void CudaDownSweep(int d, int *data) { int thid = threadIdx.x; - int m = 1 << (d + 1); - if (!(thid % m)) - { - int temp = data[thid + (m >> 1) - 1]; - data[thid + (m >> 1) - 1] = data[thid + m - 1]; - data[thid + m - 1] += temp; - } + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; } void scan(int n, int *odata, const int *idata) { @@ -52,13 +66,17 @@ void scan(int n, int *odata, const int *idata) { checkCUDAError("cudaMemcpy to device failed!"); for (int i = 0; i < nCeilLog; i++) - CudaUpSweep<<<1, nLength>>>(i, dev_Data); + { + int addTimes = 1 << (nCeilLog - 1 - i); + CudaUpSweep<<<1, addTimes>>>(i, dev_Data); + } cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); checkCUDAError("cudaMemset failed!"); for (int i = nCeilLog - 1; i >= 0; i--) { - CudaDownSweep<<<1, nLength>>>(i, dev_Data); + int addTimes = 1 << (nCeilLog - 1 - i); + CudaDownSweep<<<1, addTimes>>>(i, dev_Data); } cudaMemcpy(odata, dev_Data, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -138,11 +156,17 @@ int compact(int n, int *odata, const int *idata) { cudaMemcpy(dev_ScanResult, dev_Flag, nLength * sizeof(int), cudaMemcpyDeviceToDevice); checkCUDAError("cudaMemcpy device to device failed!"); for (int i = 0; i < nCeilLog; i++) - CudaUpSweep<<<1, nLength>>>(i, dev_ScanResult); + { + int addTimes = 1 << (nCeilLog - 1 - i); + CudaUpSweep<<<1, addTimes>>>(i, dev_ScanResult); + } cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); checkCUDAError("cudaMemcpy to device failed!"); for (int i = nCeilLog - 1; i >= 0; i--) - CudaDownSweep<<<1, nLength>>>(i, dev_ScanResult); + { + int addTimes = 1 << (nCeilLog - 1 - i); + CudaDownSweep<<<1, addTimes>>>(i, dev_ScanResult); + } CudaGetResult<<<1, n>>>(dev_OutputData, dev_Flag, dev_ScanResult, dev_Data); cudaMemcpy(odata, dev_OutputData, sizeof(int) * n, cudaMemcpyDeviceToHost); From 22cc0f4014b6b183bc87f461da92170996dfdda5 Mon Sep 17 00:00:00 2001 From: immiao Date: Tue, 27 Sep 2016 00:27:03 -0400 Subject: [PATCH 6/8] update readme.md --- README.md | 27 +++++++- img/passed.jpg | Bin 0 -> 54143 bytes src/main.cpp | 14 +++-- stream_compaction/cpu.cu | 10 +++ stream_compaction/efficient.cu | 111 ++++++++++++++++++++++++++------- stream_compaction/naive.cu | 23 ++++++- stream_compaction/thrust.cu | 9 +++ 7 files changed, 160 insertions(+), 34 deletions(-) create mode 100644 img/passed.jpg diff --git a/README.md b/README.md index b71c458..bff33ff 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,31 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Kaixiang Miao +* Tested on: Windows 7, i7-3630QM @ 2.40GHz 8GB, GTX 660M 2GB (Lenovo Y580 laptop, personal computer) -### (TODO: Your README) +## Screenshot + +___ + +All tests passed. +![](./img/passed.jpg) + +## Performance Analysis + +___ + +### Scan Comparison + +| length of input| CPU Scan (ms) | Naïve Scan (ms) | non-optimized efficient Scan (ms) | optimized efficient scan (ms) |thrust scan (ms)| +|----------------|---------------|-----------------|-----------------------------------|-------------------------------|----------------| +| 2^10 | 0 | 0.0418 | 0.0882 | 0.0954 |0 | +| 2^13 | 0 | 0.0761 | 0.1681 | 0.1473 |0 | +| 2^16 | 0 | 0.4851 | 0.9168 | 0.3155 |0 | +| 2^19 | 2.0001 | 3.5169 | 7.7984 | 1.6069 |1 | +| 2^22 | 33.0019 | 31.4348 | 61.9076 | 11.3602 |5.003 | + +The performance of **Naive Boids**, **Uniform Boids** and **Coherent Uniform Boids** is measured by FPS. Different amounts of boids are considered. The results are as below. Include analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) diff --git a/img/passed.jpg b/img/passed.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ea271c713e7259dc0c0ea6247cf69870c432dd48 GIT binary patch literal 54143 zcmeEu1z1(jy7wlf1?iGTTDnw_ZiG#zbV`G?D4i10pyZ~zTj`Ka1(A?O1SJK5Z*4$D zkN=w(&AF$AQ%`JkREV=PNzU3AS6V@ zD~Je4SFT(^Mn*zG!$C(wMMWdLhJ6)>l!)v)DG>AIOaI!-Ofk8$_Mngp-Ku0HFry!wV|EGUWTR>Pyuqdz)I2cM0EEWtL7R+fI zh!_L|f`tR#_U8{g0vzHMSZE_5Fb(L>&o&SYEF3(-=_CjZ4!92!4if}|NuXtV@$J!J z9Fah3cO17mp~dTus!R}%p1sd2N81x^2c0Y#8Iox6tqUPkjbHDFzZiMVq|$RJK%m5N z8@Jzw^Kd(`zb{C@^>tt;it(K&%&avJ!vjgz&!#WqGdp;~Ln1?sj>4NaTR+w|Ed&{* zxZDxHU(-}%uE6xMld4TuhsK~?S(~8|FLJJZK;~xjqi-|9+a3IkcD0}7c)+NrFGDQc zzdS4L*r{bZ-Z9vHHWzpO_LAf?RmOsA%`uyckvCj4eBR&l#hDAA8J_=4?h?dDP4acj z=aa#FJ6&XxGz;%GGWKXB9!;*Hd*9L$H(}&r=l06yXI<@IoS)$jG^vdYjD95(eACQG zHu-sdAo*R5Q6+2l=a1n_!rISLrQSQ}buVcX(CEbt%or{a>e7Vo-FxR4_T_#Htw#cj zlWEz5hE9lsaEalA18&MNSAqf!6N>Q}MY2l-|FsBL@Wr}Jw|VzzpZ=oBNMD^AS6P9f z*%zvWCT&;7t$}{!xu!G@CO@00`^K-N$y)N;sgVl-HKej|xSz5ngj12Lj6Z0cpbB85%q z0>9pI>FSEc!nFFEd`MgEn4j+UNl<^!&~vJ;r|!V>G9z#BiNtXwz{HES^01+zz4a5Y zB=9qNz=L4%nnK0$$Y8i>GyaU+x?GjYjrzMcYNygJj}@@_DwL?fQ~4S=M{1!ZIKIwOk{DaU+e8N`0B3V}e_dPIdeV-ygR%PZ5Oht7sc=J>+W4o|sAhk^>BqIx z3%PNb!*!w!Q#CxDm0Np0#(K*`*G273U%Cd*#0dn;ZnMVxrzr?L?FV-jtX~be>DqaB zd0Z94lklHlPotN`B|Zgf#WFX{bf}UwRQYy(OyUm(C_@{)^x?-S&ktdCwD!s55RdOv zH}@ve^N**(k-22VXuV{?5$9>y>G+bmhuQj&4ZqWU>d;LZ^;)B7S zw~;rJI48v1tRs%Oxb4(j&Rzr0Pm;9_G z=Vze;W32T+`3W~}t}^@^Rfqz%zGlqbvqtTa`_b{+`ezjvWxPr?1U~A}C@L5Sxkq5b zK3`~k)CXC{yMPoA(5};bBOG*kuC3wKUp$jBNjL2y_8SegDD5Y(b-0C!2}X>MCXeW_DH2X}-MCIc4{L*E@iBYZvpX0}KE$iJ zSN2-gZkgVJxi0E3`;%}m5fAkdj?9B`zr^VdKH>%FOiGT*Lgb19BxSG&P$HjJvnz{@!C^EIKRbFn;M z+oomrB|isEzFmn(lGl43iV^`v6KMHN?P^IH9aRmXB7UaoM$ZHZvEOU^8JvP(bWp}~ zddqNMT6snzNA#Y8SP;Zf*D?lP&lMk4_JZ|l{Pkl5*pzrkTI#2Z40Q&4_*Rg(Z}-v|Ky8Kr z@t4pWyypr1l@How-#MUcKCBIM7C>801vL3D8ujwqE_1CW4=?37tU*=JL}YM#tO2Rj zF4ophcKWUKtK8S!a#a2c% zX~g>5Kxda{qcJH_Kw+UpyjZh;dtXeW;b^eqXjzPSpb*nW!}Sz2lBzf4zrQJTVDpkw zsL^A{e{TqJcVCovTnFSx!cXD+GLfF=WgWqc$BK{pZW@N)c1qz2!H!_ST{Aehuj!fZ ziFn@&thBnCwYr&Qc$gV_d`@yoQ8&nZgc6RtuafImFS@tu#wiokOF5@-KPb4eLGu*(%4h`xM{Xl6K$=j7| zB+q_>@E@y??Q?8FFmTBO4h9z%-~%nO00;EUaCX3dMt-8a1sqed&|^y!CxbOVV<249 z7&oiWjK1)LZ%%1NC^r6mgtyqd;%rrCs`fGrkWJZ7dE3(8UH2PFbNe&HJ0x^_qgI>kjgV^zsiXDmcUx`&+Tod zEGEOHvBe}62M$;q=Dc@yKb5VuyEg~*-1BGB3$;QjclGp}*?ZpqNxz(h_)dvxrw2)u z?36fbw(ARBwATC6)ZgYb^fzuE6*H3v*_y%*_B+Q-d1@q_)J$I`UEITG`of~I?CM$F z>EI~JzBq4qB6=OZfzM8^RmRnGKkg0c6~9hlj*VxsudlwQ?S6+&?fJyN{|)y5ce5?J z7wtXnHly+|wMj|CP)pdy29#8%AW*X3L$!DM6Ass$f)aXq6YJiRc;B-%4>G0e&>SOR z^yV|m^x-SmXiV+4E04{EC)7b>5)OYVT^-Gz%Vf-KP_F(Kbu@|LI!_98`%sc_nnYI zVF7JGU{gs%$k?{{Vxp&#TsNw@%aIBDUEU^p^b44hafHRwzcN(s+_^WoOcgrYV(mMP z9030wxdGdSyw<@}2{;@)9bEqthYPJbhXeKiwgYgW6UL%#x4k<*lH#F9!h?75dVS5n zX9&3TVQr4LSXu)P&DtgGK8HRBJK%$+{SG)@O>Z~u039d(iCA(kZUOx<5nvL>gITq z3yP862G&(d@2jSYjpd;SfYKgwN+{k>WT8{$e|j6=205drbeiWeTZno`GB*h<9)4m@ zO`3uQ%zHQ~m$L`pNAxZ?d0}UGDBpH2E%;EPCNKt2WsT0fSqTY2mi%m9C^0A_g5uCO ze3YRvuzUBICF9E-%ftyGwP7@s7-qAw5@{QhA>FlczKxk39dcjC!m*ZUl6()v(ZSzwr z($~iDc5UZ>r*ll7Wh#I0w;?;>v{AcRV3_MFPgfgYLNz-$=SpTVOB%+++EIKIFt&(f zUN!imcIh?vo8H+zkRxm29B#(J`q*blHb!QZ^DSIaME;B=L1)~loTHj>$Q|}sq3MI> z&IDQJXp>B#gXRjup-^xcq!3RP8CY}1Y0b_%^(P^~GW82&Xm#!u?!^s`TTtn=Bi09_ zJLB}T^Fg`ljff1%qg{Z^VrN8JFQCymqbULv(>ZSkbLE0CkuET&D1xxCbe1#j$q!js-Xh{@mp$&UJq5+s?YR9V1PC5F4SdVrb4%&o zEwa=>xG%}3s$7}g(@fvMDCVB8zFh4!6vW6W4q2$Idwhw@#54K`@}K1b z%;?9On(GUvxD?2sf4-Q2>^uGwwB!J28M;?mwA_S8x;|~ra`1z1zEtP?hv+4?yV%Cu zM>7UquW)UyO+%gfL9qd(Vxq-5S3?)8oH23Y?mthY&xt(k!uz`^(gIW9uh-N4lfTOv zz!bR(g&aFp#PexKhn)~In_Q1@&85mgl^q{kV;N0AAd*@t)aojqx;r1^iy!1XIw@V1 zn3@sM$N_ke=o1~+J0;$(1wb@|Yx(0vWC)Z{09FDJ55T@Nh6gw;Ko*eiGn^z(%LX2J z!L|TbB8*=v2KW=ogp;sDy_ZH@{tPUu$o_7)^qr6mnl;!j1UD(XzYr$x_q;D6V)nMg zooi9?V2jZJk5}ghU>WO0mNf>)j@(DCgBZJr_X4;GyfQ3Hp~3@R1T>iRe?f%9i03oz z92*XXSC2MHG}d%U+qmMV%S< zq(8EfSn#?Bpez2%f~N>LCFJ% z{v*#cWQ*!vso2ThR)UtirGpEW^%!c)c{ftJ0rd{0DILpL;eAi{W?*vqi~b?>n%5Tn z_$TZ%YqY?F`D^mxZ-U>s7&fnR`c>@peZ;wyQL1}DEwsw%GZNfw-;!8(6O~knip0fm zc+rnj6G2%eaUH%^Uk(pkA$s3a51`hEt4xC5AH~q&^53yy$;ZHo6+8aB7Ul1*_2m_l z7f4b3dOi1U=^aId`%f#^5khyxgdXwx88A#+j-wK;;RfbAp|q*gK`Wp=a?%>m&($&;9}bi~&$TLDc5m4%5T9hrr?wAVz_3Tv;PRz6^O zxh0Yh%MZy#d2aDW#cMV;lfg*_jC)oA?UCD1S~N24ELusSlxM?aS+TD(gi(ML@^(vi zzO?y!f9DK!)`d84$ALH z2$&yJymf6ZMB$tUMe+c`13-o@kw|`FOhQe&L4^?j)U1-<*gT+s0O$`37%@JIOJPtW z1qR*ZTq_fOqE9w*I61kHx~L7^?LaP=AKGdG!1CHc$194uOYY9ZjBfzlhcJzhsa@!I zUy=h)r3?yYcEXt&x@aUX7yy_vR8U<6w3E-X_Q0|d$+x=gGUmm;)8MiODibxXqQ`5D z+kgTNQ0(;HnB#O+y_L0v$Yx0};@|pMxMf=fABO?`?yXnmH-Dn%z|rA_B9K_!1#G@( zRO~TagoAb667>OPW&960s+5J}je4ao($s^i6$-?(pcT*_XRH354TMW^ZI*()zc~RF zG@|HnUI7=vF6Aizn28FDVb(cLdJ>vdxc3hz#-5OZNB!i_*7^IgLMxy>l%Z=%s*W|r zEUCN(th?C9gPW$!?gFIHbq6|#W-d;YpO6B&$^CEu0=F|6*R}g+3m*yrTqhIYuPl{- z$(Fy0$j}p8p`lL0<7ZavWJ$8OdB}3HB;l1!vE>S9noGg5Cd)3e3s__S@xU4=X`a-`Ms{e}RzwuoU42-5L!@Gl%m6Q7jdx3@fi|3>)A2p1gV~VC_Khu)x~>IaatY;JaY3h<{b|{q zf^05U^`131u&UrE25#x8FGjGLSm)&2?EKC@--e(l zk#n;x=SkV-*2UZzHakZjN#v1~7LEbZysp}ke+q)=dxZqMsG$YgI+-ux?<=1l2^X9JJ_kJNU?h7C=@ z=q1Uv2Y@3jn>7nx=*(3j`o#8~9RaYQecL`x^$IYdaqXQh1IKUR@D=@np9L*lGaE12 z(f}R+rYX#*TJ4MJj-EZ}`Z)`k<1^|5K399p1E@8qt&oeS=5ysnFkmP4-)3963<~8i znGI(i2x^y!!;E0W4-_nde9`$guim++<=A&rD+6m*sD)De4HF90z?z*&Jq-ZnP$7p_ zKzl$}^~WLtC_uUAfnPq`n1XSJ0{kVEVEgsAryz_A8O_+Z52&EOFjJvWK(%ui-u3qk zdB4qdf#~JcVw%Dxu6KZ(15P~PeLlZ%C_6WyM(dgVSp>K}qoE~$?4LQLK1kMg0b}-0 zP_T05AakF4vu%Ly>g1c>oU<(sb@m_om&09h%)Wh}NaUeq-!IlH;G`2hb_QfH*k;aR zft5Y~0B3%jE!zP2C|f)W)llsd&*fDa)Kqk`2yv^i1=L03SPs4;UZi~$t|xh~hAPzl z-3m(pLKtv4#eD$8lS*7i1>CPi^-oE!2%vr~F$N4OYM=6%=ezkUPdvHc38lR{fPCLs z_smZJ5i2<}rp{FK1Xq!FaV9}NgXWO2Ui5q1^v1CQhpCq}KX&a1M?{}&|fzleK$ z*w0Ni0z?I>ApysIO z{M56H=en=xJ+>InR;4`wp~^QDJ3up*0-roCsSc)mlMLl#h81SIx=OJ|ZC)U_qPsSV zT8AoGs|AFHOqlYGZPd{JEuO^5r*ozrxxe9bGwLs*EKo>NxG$IM5%>3eFQ`2&FYsc^ zH2UsSemuhaWG5A=^$~)&7D%%RAEKGq+Q61Ufrt!MHErGpuNdD@d=?x8d|Fg<{WxHF z@O?SwP8O?Btb&lBQxI>9d(@Yx56WM3uiv9dOF;1RezkC%$(2W6RPm207;$)z`P zsYxQ-{y-D*SyS&>6E@JPFEvSs+xx5m8%28(dLa818Os2*{#Iyjp(B|L_LZJpLhr_Y zKr7||!bUys`cIr+&6?ytAJ?W-;(KVy(!2OJxt=4=#QtNyPQvT*|9w@x;%wDGT7eo1 z1OtZvq!xfpMHfFXuxwb@C=~SJFv%(5u@za*k_o^*B|(_H1a{NKXSA{jbP$|wG2k=+ z$$j?GGc?8Uqi1;OGg|4KV1_}l+peDQTNLxgub+LnZ8olQM`;lZ@l?)>xvv7wU(2VF zj^R70IH1fkzB;vVl2YG&OtOCqh5T8rB4R&8^qLa`d14Hvob=)OuUJ21ko>KVOsXU? zW3D4IoGEyP`20lU#PXzsT@8BuPZzoWc-!x_u;nz*nO03@_%(+6V;##Ozg{piui9L0 zGQa#|76?t*KxqpPWN856{-iAy+cgDD3Vm!!I7L==p)=aTLTT$OilrzelCBg)!yC=d z&rgFatgy2F^l6QlNbwM}K=x>C3RqJ{K0J2_i2*6+7iDaiCdAK}x1FZFuN$AdQ>53q8g>DET8+DSEI-=vR2s^}_ng~SS_ zZR?`voakGh70NtN+)0d8UniByHGWz;dREkgUPDi0XO&~nmfue2_x$L1srPR!=R=%T zUK$N2N{R+8G@`33N+Ft^odyjkbqk7%i>A>@Uy~-Jh>VQ$*Cli@p8NGrd>)jzqL={V z!ongTAt3-6F(`?F{c%{JYZPqQlpI1)nB?q=`cF!m;W_Q>9lRscsD#1K%5W6)=2#7= zhwg5Pw4~2}zQ~dx3c|_^P}5sngDnNAHm6MCmYD9fKOj6V)k1vvK()CTsIUYoh-Puu zHf&LLAJ!na)oa5x>J-7~7EOSP)hr*Y`7z@X^BpgzE$z_hYn!W6}=kcwBKZK1s30H!to^x;VO z=2HZ@?(K(BkEL2XI3>DBCx@h85WYYfQwq>e{z=Ld4)!q-jj+UAHqmo9!K%bJUG)*)RmhnQ0)fQ}GM64Z4^={Isi#h=!W*1~A;_()T9z~n8~r{=iri*^U&M;Oq6Y;c^`hwg zr08rVH3Nt?#-fY6i{{1}|0;zv^;}r?=9n~54}JeAwv=mGY@5_D^(iP%xJNIS11CFL z0gOl|E!TZVJZTkyNm4E=m9a{aZ8tqVo&2%26yYE~$%k~?JB`X}*rT`1a(AmASYzQp zaB+N>?Im++maizxZLH;6cTGCU!fh0iBsKaARJsX|Q=?%TuH#hFo3M^mW2xa{6+X~3 z@r<5sM3A=kM(2;ijheNXWuH-ch_5jt8D}lZD;3x^IM^-upupSv2G+@H)g&09n<0kQ zt=?SdBc6|SPEup^JGzxoKRxWwuIc9%dA4;?)~uP+-cqf_WwAVW!S{~kQ1R~880!1T z#|nX?&@2c9kJ8gKbCpP8Qz~Jf+0|X0Be|8sAe`-jB^?shD8(7ND8eWwdYqF$$q^l? zkqk^RT^^*oZcCVIdb^}h#X313EiqYoZZ4lyx$OJ;y{SfzJDM%MxYqfB4#=KM0NCp|H{7BjH^n~%1r$Y9_PDLnl-!E3 z?U5PekzW>$TVPM@WQu&5fuq-_XAVIXm~I3E`l+fbbW#yXc89|-4;9|`Y6Yf$$VX)} z^L-U8o`9@ihC@v!PaMXZZmS7PwSEe!`z97gK|_aR)8XUGX@pv2{UJaR0-7DVCRg<# z+=?@doq{;Tg>h3#bI^}p@J@y5d^30SS{B;F-C>k*9B>G$EZAEb`w6n9mLJBUVk7Ll zPflRhfX=Pr5p`W1MS!Dp)4^(qQEce}x>s@_Sm8R?KBVSM4auURjZ98SPRH;^)oUq5 ziPgL~#Gp4<#{TM5noR3%DVqHRmYLhhX-GH1oCO8(9W5QRwe)5~i=HvgM?!Fnn`cF7 zCJY8~bduekoOra^PNIxe=elal94g_X3{DcK(Y$L)Fj!+#lBMS2PZe0?hEobH=^5CE2yz|%$de&EEMjNvB_C8{%QZHmVVlnd!}1cR%EkjcYz z^E6w7uxsg`Y;Y=PBSuFU-GCvDPA8F%6ql;L3V}zAUHXzncx*0H14@g)+Rcn%B3817 zr5B=sLz0dPRM>$MNxQd^cv_T z(&yBuOk!neYtg5NccmkeoP8xm@?u52aq{apr5%;RvL(XDL!3iQHjMhYFvDY5`6vxN zCZtHnKGjLio`Rf)C9WJ5MY#x0J4QC2tY<#Gy062zsxC}?vE+K?c;Q5L_K-;;1)zLmp5etk@JtcgNP^}x?cv5nEajM3S zf;q?)HMJBad!LAzTZvJzBj7?RaYDOriMsEtS4kY)B0U9ljmmSrbP{mv>|2!m#(AK0 zatdNYuv~eWSs^6ANjfCF366Qpai>0qIxehQi@+0Ovyq08rN=sO`s1yponfb_tfG}n z!WSbkZd}daZdI+NP~#J{POmSVItXH>?DCFhLs#l5cedX1a}yGgQFv)@Hu>5gjX;iA z!?xK6Im42LfK{JU2vba#NJC<+8ii5Df}Svf-jW9%Lfs|Ay9?H@B(CE%J%9fz6n~nH zi@7mb-a9@Ubf%15Ex(H?UQ{9!GW9DYUJS|~JiapboNgr!c|eT;ftktA7zY77_~A0C{EYDaYwtB2Ax@?ltX-6n^mvnm)B26@)oRQ zoB0+_c|7`cgp4PKZEIvsHv%Ma&kZI6ra^XoQjP$9-V|@x?3E2RZ`Cu=iF8S|*%@a~ zZb!FO+6@Kk&FOpE+7m)|eUG;@EpiHzDkSV))S2^lE2%kx(FV+4N78FNRB{yG)SuqZ z0;jJj_YIkR-I$hpe!WhrUYWRlUoCyMjIhbJ-`seN0a(oFuV37#{T`w3J5OZ1!*#&F zOO|LkM9sKmQ{Pl5(ogM#09IX3Y#ru|Kw=PdvudXilFK!swyCd5kjYhKW<(+`KOC5h zP_b8yD$k2BahF%gu#(1Dy_qCN)?q7^jEmC}=c>i?ZDi{Hh+hr9sXNryvW%mtU`o9p!Fk-O2ls z^@zxW{uJaD&ungD?>qE|#l|`3YGq3r!J@Zhvmb$EFICsSn}6FPI8Mat%nyez7M0^{ zQ5A+qZ@x?9E}Qc~#Qud9Fh#MXMpdgN&{icxYvkQ(73j^480q94;{c2MHB&HoAZ76I zWU#mq%1!1eY$qS&4!VZw2Xf{6( z7{G#FJ^IT+!6F4n=ttS1$~ZIv)?cUUO&r0UlS2y_7cIK=Rrtd_K{ou4)uG)nN1UHUHq~e~KtkAWXwmferQuPkSK3T2sF43$* z61GdRR#}O{J>Z$15epDLbPjQz>xwR@r>~=OF8OpcLa*q=V6sx~vG*yfKU{};IuCvN zD1XRS18s)al(TFt#%n>^<140WQ^=u=DE0@rn#>&fN+#d1{f8s;P7E{r6h2ayUXXQd32 zwGB(o__4=<(=}Bwc}#axDmmwe^QRbD*?C;-?;>s9<<#Q|O4gG46*5vj77|NEctsX@ zH;tyc1qA1YC!J}=?Bf%?N=QNv_Aj7*AsH8VuUHU$Qg7ttQ~EnLMQ(2Yn557&4$CgR zIvmGTowt+ioqrqAEQy0k-%Y z^3B1M!6vLEewTIJ1vymaAkmNVZ5JCj%fiR3f2~}=t0?CaHama%FMyweQ&EN+eM3O_ z{4w8O=&2lomZEG2?6aIg>!*anPlIfeckLIVIV&;R`hHPo=kpJwL^^2!bm8rgeWhjH z@NHKmGIRmlRS>*mzZqgE;%AN!^Iz8;TNW~!6C2};{`S~(Lyos1Z?C5pE$lt+cN7L$ zgGV>k*=lwl4$0i)k6l*Qk@LfSKmEqn$8j$?|2x|iueQ8Ht#hZ!`yn8iuV0lr;^=eC z$-h@)6cR^11vyR%sWa=qQ}7N{GwevAd?Ws}nckY_d8@sCYe8C7CwdyI{*ANeYIIIr z42Ppele@E$sYFqf^%&x(6c$<1Uy~@QQY_OoZem8N6>B`yt0&d zG2jLJ$J8yA?1WLnqJ+OsvXfF!vd};#{SUINBt9o!Ger<%g}hhoI|Xs>O?(Fg)wJbn{NA~8f|-IrnUOD1j9DO}6$Rwh1M7ko z=Is!*Dpfrr>#c>ySnnTS=-dlDVkK_tXU-fFe51a3A`$s$&0~G&#D>A}Uon}!+?>Am z+)WU$m;k3*Ez|v58*Xv>Nz@Sh!JxAkYNNC&sCVyEYLqVb^aK4Jz)ZS?LQ><>iS_Ac zx^-m&L54=dCXp@ffCFlXec}6*-QP&I^d$>M@B}(8eN}9lYa8t^2FFi(>740tu8Z|3 z>cqnD4_+vpcq{QwPYEwnr>1gMdP3;mwVK5K>kXu;?;tdh@$J3x0AL9U{I!8Jq(9j7 ze17^s$!fjJg}2z4FP%}wZ+?Y4Sl=1}w*rC<2XTUr`dqbccr|T$|6k_WK*sxG$2954 zpR?Y9m2oCxg~gBR%`Y1-2x>IujQqKXrWK2t+@qbp~U z%&Yap`JR!JN{(yW)6_hrd#hLH&8cl#{;c=-$l;w+i7uG%@q1c3E7}VPxVJZc>l75Z zaKA2v-w;^+a~sJ0KVt*!|2s_hGuV=KS2^EM9PX9T^1j+EWBAoEontoel-QDo5a>ttcRpC zZcP06BMn+FF+?H8Z{>!F8v~r*CrF3nD)ERVEmu4JbOUrlyt_=kw(>#Zlz~J#rP6Zu zg)O5SZmZI(1A1g$EE-YV-1Oyp%;k)EH$E(#iF$mm{h{&c-lb7L8 zNTpj-x<3!9zE#62>275bpQHq+WFE1~EtKA73QXXPc2H>)%9mQ=;!x*b&xL-fd@ z$U&kM@z!N{&Pvu)aW^(W6yL;xBHfK9==3ha$mEufWR=9Msmo7dzxUFOK7AR)=eyo5;_p zgHo;BM@q!vrUAY%t$Rc4#PlMWrswY-PxiVOojvf7Xqn zUJgHeECHLdAVG?VDLVCsIdR;?Z1rc;VFaoc&kgQ9%~`EQg~mTXT@_OLV}cU<+<&83cB*WLb@VKOZ;<2tK%W+Q4DGZlXyYP)TQ-}zcWwksiu^bAj!C}g^Yf|By*VM|A%@{IUnWKe4HHupxTzc-@? zkP7d_^(gK27gV-{nG1D$>b8D=ZFqV{zV~6T*Y@ny-?o^Np|BQ=@wdJBXjZ84V&>C> z?>-AO&;`wfC@og#97Ov4ua)ZG0Eq0}n+vmc^}Cli__kGw_rvu6UMc<$-u$nGe7`oF zzcSkY19iVgM@PIlNxZDA#CUv9Ylxdd=SDJ~SBZg_)2R>Z9e;ezY(d*kU&jKU53; z*q$2?R}}4&C~IHE*($lvVqn033Yz`(22CCYo%l))AUq!7AQg;`8~rR6_+C@n->M}C zGV|B|QSZPvSd0UtV;IfAh*UWaEc{X(3y)NG9hmr)bG3vc*hodqjJ7znSCrfK2yAOUBSS#Ae_}dV{7_|?J7QzqWe`D2RNjh^2 z{7-)ocM84;TxkyXE+S{PTJSj9{)pP1`{zX>EObu3gMS+n!G!?PC zlYa{Rd0d1d@MAZE>Y9B&kR+ZtVhHE&0U^CM$G5iF4NRIe=+`LQd1xF6C>}rgX;1wl zv9_NkC3&GI{!VP*jgCaX+h#5_K|IRoQ0`IIkxr;%nz>V-KlBNuHYD|GAJ@uCrRH-M za*v!kNau-aG_}~!$3%>kH0NCDSA;L7M~97PHR@1U6SjUhS(k0godD9h<*3XSrRe?P zc-@FFn%S@e&)llrh1(=*Wf6<{v7>WvMVlx)mfn!4g6ckczg47RU8I?SseeA8{bNG^ zGr|W1zcDXh<{I`F8dKDCbEL5Sgb8W5!L=-Go1i7`?@;jdsi3`;s6Ov5&~ecRfcwn@Gt%pZX3;x+(ee)cCO($BQlXJaGvc#>WgWj~hkH{u3; z;|)IBVXa`%#om%tFvJSwZkrQ^aijS7R1^m@Ac{e6* z$IxC&&Tfis+3_{M32)Q7Ay-{YF+1NF9l)$zm5oH|?D9Q4ZZ3OMQaox){F}hLAnDg- zN$EW9K*p3k1|OVd9l|qUz4l;bcUW#Rb$X_*5g#F!c9i0(yP_1ra&s@5LA9+~-ANbzXInw|m0N``{UbGo8OM>@z{{qA(i_@hzp{HThU z=gK}DllFzg77ET%9+Pw@9Nx0CIaxUMJ#Swwy_h^nW`0$lUB;g`dV5|%L4`hhC*U(@ z6wlnB1J5K8u($r4+_`ZIivNSja^1<29=$^<`D<7h>o6Z{eEicU1BL4WDy&H!_by?h z-O)1(D&$UqpKX7Gf8VFXqm?xQ#32PY z|68oApODGk3XzVfpoha6<@=Lm7I|(-J^vLDJ$}O9(oZBMcY0mvOzrh2%kgs3`+IWb z+)}#84~7|K?@m#DBmEm2!|VOh4|VHPN2jv}Eb-6!mX}AD2N+;Yx zz-qJ2VWt3`2-~d{p66(Mx@x5v%U-=Wxya#M%>0+Ms=YLkW`KdoMV{(%(o+-*-plUy zs7WI^=;@0cxL@-L=b_x7OTZ)fBet9jRz|hdc+#}S7IO&#cHF&-3|8Q_<=pt0RkLLR z$rxC2?SL;oLy)l?-DxjToio7&&)Peb)`PBO<-Yu9>u-F7m(ui&6rI+!3%RV(i+6Ui z@H45TT&NzASy(l?UcG|*F&mE^9CT@|{%*+e?<7m!ssEp~^MBUPpVL*}L#zL>weu$z z>A!k+gc?O04u6Z;nbUXyG8YhV;oto?B)&?24=n$^ok5qk87Rp!Gc=Z!!%;c=eNTwq zC62DV<<@I`N2db(x*t+IhOeYA=3M=df_(Z;Hp6$l{v!#TZNnIongI`NzcgpxVwy%E zkUdbUj_Dax^!A$Oj!;Ea6GD>b*yw4zCj1idsD9-K!2I+h-R?B2u{pwg&1xjY%U6Q5 zU5&w4t~mwu4EqMw-FbFt>p-Gx11gV@IJN+ey5r4km+D72|3i0E$o_6=DxK`J{((V$ zartw`rM+~@1e@RdEf5-@ObmHRnR$Dd-Aea+4`BqExI16U{9MYTXRg!8=?qD704w#~ zBPMwLjqq>O?~b>BY}%uRKkvT@{?o3W%!Vi_77ZgsV3Wng<{($=H%SEq@@!&@^#;}@ z`n>JG?sH3f;X){ z3JcW%o8Z1DdVg$6`niqbN08*FyjhVK;1zjeCYw&hTx`Yrp3=7ew-^ER|66zZZU1f= z{!jDSaaK#=&C6>%(cvF=lRv}97&sbPZB%>vwBShr&)Cc1Loq)WsWz`~UPrjAs8$SP z2=r1!9kyGXaF2rY6%`cFpZSwIrWSO{F<9|B=L{EV%9^B389j;pATK0cr2g2%xkGC^ zJEDSy55e2emlG77;5McpLmYt$WC)Z{f5jZRMP zWD{dYBZxeK2mCEA1L>lD^j{Qjnv4$w9-{%>P1;%&Mz6Qy6m;aO^r}6!)3fz>Sjno` z7YCl~WEuFv;76Hm;7KINf_pf=+M|9s8%l4dq?JsTab7%5=yh^Frhk<+9;+v%a%@5qG>JChh1FHuIAdrRzrg+*%t<8uj0E=$32}7OOjlP88hgupB4- zz_pI?IdM0@rSJbb{0nqg0KN#vhzb0o$IJg05(I+`bt-4i!c9pwSeHU z;#K_l*zHG*uie#R_lglVbQveZ(NA)>XVx03D_x2$KI@JxH0HOvt9hTbqE4ppoaF7N zd{!u4`m9Sl8BThV-|VgywO5S0ArS>6y1kx%mUzYHbf<2+XjG7Kt--X?gs*WK*Bv&Z z7R_Q5465Y~%eH@hlr$$14fKvtztNi(S{3Z$+W8*hs^_leyO+y`-KvZxoe=*rnv1e%vHN(374ILw=H)x$L#3Z#mO_9X!IfIT6c{6`%xD%cM zF`67TrI|lld+UMe+>L?CN%0!&U*&0u#!UAmtuL?pMzS^#*=ieNwZ`6 zLAFGVAVEw5qjE!f2^4`n^)Jw|-l zd!w+|7U>HjXKN(oavoMCgu^sqnc>p3h~jr7ii|(u6PA8#*0Prt+IIt&iX0El4V8b8 z4a-|J;NzoQ3zGgXvtYt^X@-Yj|6Wf97)lw0X-0*&4zYFx%QlI_L#ZB$L-Zy>RPhq+5=EgeC@?KczVYQ()Y=nE>{&uR6&+Wx^F4!KKKFkF$$@dbMzd5_;@3JDWR72v zPWjn|c~g?RU+OEPX6E}=LjRi^YLs`;GB^UK=ZofPu_X5_iKM8wdYSPFF_VtnFyED# zYfDukQK=>t$rw>|i}sN~@JKi}HuAhaGfb??D#PYp)$4uqwc;T&H}DOCZY7iB9+}Q| z_05D-qkJr;cCxNW;r>2h-_&CqVgD{3(TvU6Y|m26{-W$mHJLTVZkqtDQ3liu7GwJ( z9CqqvROE*?aHZ1r|*!0*4UrXh3VCms)J;!HS;9}L| zY}~Zp&B-8APbtc=naC&HOQaZA)v$iiXO0n~K)pB__boQt=GL)tmY?LK&(3avOsjoY zi3Zu+QRGcbT^qHO2y%?T<+hWB&xL$BuT8HjqCQH`jLxm7q6x?lj_%}pOp|Flob4DH zM8V#KCUzGEse~e^)zkb2DASi5e(F|5m1UtB7MU^ZhbQr4GwF*gVZNXg!cps+C9 zAHj*bCNoMljWGMM9a*D$L$j7Gkn;6IR|pA59Vy(^*r0M50@-=#M$vITjidC3yi%&Y zI$AJ!;ty!Q&iklEr@2q2Og_bocr8WiG8xOtYNxa4m3Ox*5H7}-CZG#z!S7SS_~4 zcV-f=Ro&-6aiNGn44Ctr2y)Koep$Cwy|2ssK85{cZ8>aPGtcH5sn;y<1xEaX!m25+ zWuU|Z{F~za3D5nR%fB8&eYcn1^Ki{iulLnY_?(Rt;&m%=Yk&VPS2NJdwZrdb!5Fe!q@bn1)ZAs3>7@L|Yw-PDVryjJ_Z9d-?z0T98 z{kh4a9Y4}J6-QU>eQ@_9{ORj7$AcEv7Y2hhT;kQ#l$zw|Cj@>RDmms_n{XlmyIdTPgIdtGnINyK zN}@E%wwdM=4>C&0wki_BAB*%`;pfm66lAh}BVJLK<0*-Hcv%YGrq&E_# z={!~&+<8IU=ed7B{;fQn)XQ%WYtdUztYd01KOxkqE{N3Ntx7IP_XP?t>>h%e;u{%b z-jH+mY;kgIs(mRQxP;;O?}dD2sPB$X?*nfX4Xw90!OT|Bo8jPW6*tB-4M{+UF9v&Sr`J5#7e7@ zxEG-xM{H0+6blD2vf^^?d`OZiHVK4l*3+akt;ThsMbc+v4rT{yQq3@WaM>kPgC{>1 z2|B7@L4gg)RF2Y#D7sm=TJ0y6Grn>~NqyLhrkMPwKBjm*D4VWqMCi)ZyxuSK2D&R;c#)FH2iSYGQ7w^al5Vp1(-#Nd!a4<^O2r?* z-(DJW^S0>e5g}_X^*3oJJ8<)cm+jPSpD4U1Fd17G)y?&Tqg0m&2p&S10!3>^eM;sR zzu5@K;F6>fKF@8U<`nB}{VehFp?VUBC%p_M77JJ0mAvaaa+TSNF3rGyh0}Dx4<+Lw zibcw>ds+!?+Pz#hd{v@KGR_r8+0Ze5&P-<P3-Ovv14nUWjubFNNpW9vIGo@{QFyw8aXLO#gM4B0XlP{%%Kc+FdI!r9vO z7GhrykN8#>(cg1z*c+72JRg&l(fkPDsy;wHeby^y9sl;QV03h8d(-y43K;ccxm>M=c%Uatq7_qoxz4 z2QneZNkH83Tp%yeCiy`ta#`ry4E0|TEh z$1NofNwbjffcfx8A4|DS4$si?4C7XkEalTb`z!@jRjl145U}84!V>>LdJcb=%jYn8 z&S|S>EdBFz#?XgvwwQ1O&IDoiyaGOmks_kcxOpw0QSufB%l(cWdPT(^tc1y=c@a@< z55emRQ2lWsEduf5+QHkzn!ob<>Xq&w{XdE1DJbpmdgOL>@no8SD$aY~iNA)JcJ8g5 zN}_4dnkCV_g=!OLhab!l7u6C=5z*Z1ae_Q6_wqaFNSYMqcJWOmzEKAU$pTr;^1D> z(&g&>1ch})$KWWLGTipv#^2|^N-qq2Yw*iurE|{XRFRC_w(ipK5{?wa4OBv2@DX@A4n@<8V!-+Lh62tG5HMMRr%-e(E4RHEY(;zkt=x9fy zqy4ZqkT`nF;$SCRq)cA-mIqtTKW+1B88#7sxyo0eo0?cw7SXw7leP+fpq878Lg@ug z-&z$u)0)=$PXa8Untl`#i4;e4u5CwK)*|9-8aLW`+WZ4+4jwV;s&-7pzdysv(t=vt zCGP4h?dv)2<3CPg;jqo>4r|w_3;adSS@ai$ZDA9Hj%By?&Fzd?v8~KUby+GZ4bBU^TOU>2Tu3>YPB3{^-w+ z$NLlL+dr|)r?`01u!n-9B~G1kVzqw~k-9Uf6z2M2zt3>hi}OS->wW^lnGvjboZkiS z?*Mg(4kpcy4kpjY$^4n0_-e`JR`r$0E1of0IG{-Gdl;Bk+W{`gGy7oy_|!OtOZ2t( zG@uGtroNV&WkxBPao|1Fn{6&7gR!4fI1X#>Ufs?rLqyCu<)e0>m7K zyH47E#z8pMlezTtIl{Y;JrJkfVXq)r{w(R^n8ox&QkH%Sb?NI|s=J0>#>0akZx1j>p3f>n99=}cc{ihO|J`wr>E=8!|O z1&GL&)Unm2g*VLr`@Pv#%D`pdKIi9tD?$X%q{Fv0;eNsjULSS4+9(RHe<@vBh*qb( zH}2#pT@?N7ra%MjtLO9ou#WOe1x1Gr$AfF0%Gk~KHz;r|xN}`4!W*-pvvfKJJo&Ss z@7eE@boekBlgxWMqI44JJPT_mV9g|Dh?BriY!um7(-D(`j~`%-2$kr-52$g=>1%|O ziX)mfu}P;#e#Z2TLviIe5xzG)aA7;=a`w3X^xbis{<(0L6+~$%dcOzQsY_n(WE!f6 zu3Q^V%NH8;3iY4M0tZH^ts6D;Bq(s|I3m2fq`#$BrkiLUYr#fSRXYCgis#w!sBL!c z6SQFEMG`e$l)BiS*l@0U-vKoz>T=_$Xm~rGjR(?e4)QC|8GG2JsQzg#8XMutb!`WB z$B{Tip={>4Ab?(o8K!prh+-HKnpVO#KRC?JWfe~>GCP~&%01QVzE&J{)N3Zmaip=Y zN`v3@ggbsDoRM=8y$_(d5oybZPK)fcxofk4;r<~$%1MnPh&Y+Lkz6iW7T#A4=h>?` zn>Q#*`@Ao!Lg(ni13wimQN?xh?&WyF5181{jrQTBzh8GH<|M(D-vql~w%^cO%%@B7 z5L0iapaB})E1%O_EX)RV=*$FN%7=>B$U#x@{TbLF6>rkF!62=sGwjE7(r*o+I&SQ8 zfT!~Rx{BWf^ut&$FU&n~K&JvJH+CK);nY=N-{wdx^I5f7J4TPCQ;9`Q2$Sx*#Yww<2~dew z*`4tn;7=!uM<#f16w5uIH~q6y{q>nzh=cNf-OIPppepBcEPrDxOt55wU0EMSej6c?#)SaIMt%ekW5@PF*?lShVmPyuF?DbH#yQ;bNi7NJr$gFMS)|~oKAOV!cLs0 ziNvb>-ODo9F)jH_|h|hbM^uULoE)pRDqc zY8O_=xicp}P8y>YH?Rqf|E1Yj@SMcA|7(a6WJoMP3yB379f&68j|nH!lKS{XE}`g6 zV!CETFE(-bfUmFO9uS?W+_~oR2Z4Z$ua~^f8_reP*GC6c$4uG#92rDg*$~+{3tJXU zSoPrsMu%Zq58ZSdx_jvF7HD0eYOis2*g5mpa^QC&z2+BYO#M-mR$;mdQ&N~dA|raH z7~Q##t!B)gkIi0Fb3k^gkw1UPOkq1`g`3^Iu`R&*YUE?dc5F4xDwv@AQ_F&*c2WMS z_Sr|_cs#C`(b#@o=e%~EmklTX*v0{@?|G+O#T59Lp^cPFP&qAHB&n%5f`^fWCt3)5 zi<~zPkmmRfKtAgr9WZsn?cX7rPUes@G$*)P;&(a4Qzq$@x;=@-=V39_S~VtKVF)ioU%W| zrvYuL_HYdQMyN{*{z<%g7IvL6KZT%rt?ujJ0jLJ5cX+|DsN{?^wRIBY*Cp(J*pdsS z=Hua8W#b(7av$KM3S=KRsG+utVHY^~7I}8uMpe_D?PoAs0DVlATzeBJ7R7X*^y_(8 zgP+-*$DOiwZ`UaHQAzxFL-ku{8rOI>;Jwm6yX?%6sQzQT#zE!P*Y&Wgu>14qQK|M1QJ9g5~hc$`DQ zfaeiMANPBt^@+}Boy7B}ntvf_m(wGl#&dzeJK>*CmA9GhG)vO#VI4d6xcO(MXBmb? z>nB;gQ~Kr)^COc`XZs+9xdY`-a#vkeI|UR{{bwY8mlrYo+~h;~ofD4U^wd>|<5;6D z;XAN(LbI?l+uF|WB2?EPE`G&R_W1|bk}|E2-TrY_s}MK$PV7CKyI;>)6b?3oWC(*O zchy8CH;gi9pQ@COn#A&egmH-NEdh*29b8rricHaFtrWFS{6UtLyPNw&m>$O^F#JhU zC;F4fxx2@@`Hxkn;YB{}*<*})CXR&7`#5|O25U5va}!52d9C_#H2PLnzZlg;de(+SUPF@pn8JqYJdg#Q0_CUf+ZSN(ei9+s;=&7t7i&X`;gCY~yo!zakq z&mvd6qH^~+U%XL22-&Z-R$}lL%eDfjT}vO&Jg-*znbv{rGWV_gx@Cj=Ck!p2OcC8v zYJ(g4=NbK~ykmJ+E4S65&OH8Q(- zn)*3$P6@#6v8Of&mg+hSPuEX#_doJVrzxei3kK;wo>GEbwEB!;h(@VSh>8DWUsE0G z8J@2U|GJ&Hr8!9D}S@Q;oJGbIB7H;(v{$@vk0}6 zpF#%x23DA%uK*+3i*kn3+~inM=i03K4HBc{8 z%+N70RE4e4iXNGd*C?u9BR-_FxpsihG=~iiBxho+lKD-dhrs2{3(~5ahNWXRTtbyAGk2AGGNHZEAV`f+NU-+uLImh$3u{-X*2%-nMS3W z2Eha)6s!2bV_ekkifp4Qo5Of#lHClQ5I>~8yq$W>VY}#k%YB-lULsB%P9_CY^BWqX zPhJ>uzB>t?AD>3jbE$>Arg-yVlbitx{`24FCmvygW9OQF*xGB#mOW9U^4MIDCc<4S z2sWhSy)``e_l+>7Gx?57`k!jYyH+4==zU$oe|gN`MWOgj1OLUryynVm(T>CWXaYzj z@>6Yp*9yLkAC=SkbvM_uv`?$2-~OncR$ZANbN_r=O#NtQ>wT8zopIJ=p{;1mj!dNE zlCHDe8`)1c&D6O+w*7cgb=`m3kKcCp))-dgz7f)KQAOm=cfi=y)eMiHpYD$^CRx>1 z`ib<{HzQ1$^T}@xUYp2ps}R~U@|bJ-*zB?D`Eg)>HGTb`(0uRXA>_cMGraHohYsi+ zJd>dJNQf~jq{1B?aw8*nz>gd5fM{d#*sg0wrH6i`5SsY#Wc7~gN6I}lOyL%Qry`GL zzr%yKuVy6*8()3S_&R?baf87Jzb)i2!05}yeoHCQ+@BjLbBf+Em^8>(nVT`vM{0F}d z=?@pR_vZ&d6?pLv|GErhZ#{4Pd2mjf;n8@3mGE?}2y~);Oj+lf!WR6PV9OU4&l{<- ze=Ldu-~b440LYmX05};CoD8H1&-ceW0LUh)3I-VW2@Hh*98raQJB3sbf3!m%@Mu(! zMTn4q0HToYpdSk-`+;uYXZNQg0HA7ylSF{?XId7cf@`Jn!=iG+!h;iq_J$n5l^@HH zor=N%L=k>S(~l=iD&bM6@Ho!z-SuPJ-#q;25b;1r=(&+VoG?JvZ&pISQWgJbyEp=n zLHO~5XAdA1;7^4R9)$xcgwlZA-%$j=&VMli&!AdB_q(bDWc_m?gh#Vu8tJFvfFAH~ zKjr*x_d6j!TMZ~1xYD4Q2a)X5y;c!lyUlytn19nSlt)UNOb$Ie{YH1B{! z+53LBntE>w0)<76XI1_uo{Xt$T$b@QjFw96iW_0yqP#MbiVA(vKQT=fS&}Z)CNdw( zaf%vgnrY2aW>T_OR|(p@gt*qA=A;hV^OT)@u7{*ez*KwL&x3m()xY5Hc|7$x z4viIa`Ea<8Itc&x(NW}$7Ee~!LsrHd*_{<5P7JlDQn=P1hmo(hu`Xb1w`O28q$Tkz z-8P;H?e%v!dG%B(UkPokoWP9^=UUQJ88c$NWzl8-GO{84CGzf)8wYOaV-8ZtixBeq zu}reV_Z9*2f_LJ3d+EmuvP`D-x+6?K$w}WC{vg#EllhiO7yC(k0dR;vO$RmPw5IK0 zkW*|`TlV6~jQqHH8pvm8GGL5z>JtugR6mRG3Xg0934<3>n1yzWU!3p;!eo*HZtJ9f zmI1uol5k;9>P?DJda=>zBTl}%9hQ zLuq`H2(7j*YLgBR+CJ}h2+0VG7_CWR0XTFxk>B1zE{ltlv8=r*V-TvOUGXLqH{qL= zQyGo)Wc0qsVm`d~2bss8ds}4gZ0+C*d(;H46HgAo6Yht4v)4kS7b|+4;Iki($EH zPbW31&(wa;mxA6BUi>Cr5nGq7Jf~hfxww~Vd6fx_xU$COu zW2Ax>f)U&Wqq1oxZQ5SBK)zARjQRZ7RmY*&L!(r!xv!D*|FK2Oxv(YBluQ;X+mflx27jI7zwT zRyhHyr6O$&j_*bbmF|7nU!Qo$pwM`Gu_G9hOu!NPu@h0ijVeBo`njn4-ryU)<5-iu zxuZSEJv_WC2Inmxmp0TUpu9mTU~uSM;Uz%ThEPQ)&B5v1g)4)ECS3*oM)bzipw^&m z49jH5YMX(Jn%Peh;DKw&Yg`P6y!7Ft|5pdz%XK^(#|B0ou1KQgO{TYo-4IBHoR9oE zB8r&X(?S$)Gc8?RyGfYenH|4ea3+d z3-C%!YH^rD_RCL;19YQzEEU;fj}9b)L|>QJ7D^CJ>c~b_s&;Az9bRkG+OkdzyWPfl zvqnu^5J4jRNmbrcfegOVXw*ssU_*dp`KwD3Q%{%4Rj2qo zILzBeai|E>BbM-jB-Ni+&!%)r#8t7N7&i7;+mkM*zd)S!=$amkL0}SaZ;VG$wX?~p zahGb9+G*lVP?GmU-%Jayz}8S#9mZAKTQIpQ*rQ@k_x4Fcae5KG<`~#>l{}?izrE}8 zi~+wU^@@nGt>B2wT$)yy4G<_8CncR91z#i<5~IP%qGudP6|lsD4VP6!iAiCDGa3-Y z!c2ath8zLUe5iZ(of3mI+@*R9NkoKM`h1OC=M(@aEo26B^O2* zo?@oNAv-|ml z=sqH0OY>-bB{AdWw?1Fy(Ose|NT0WguEOI+xBy>ixrIq>Yip;b`XG((I+;qPwH`*6 z;m)%RT%ZI#%7`tur+lHFlOe4mrU0J{YmGi9w`*hPjS zM5EFx#7PWvctzBWf*g^)RtgWQ{77TO$jCf;=o7+#$TiO+P`95e+>74+4oF5At)To8 z{Y0c+YDkG+zk-bqPKFco0c6!)B=Sgo#m27o!iwsf!1vH=6Br%*164(0}%s;-Zx9; z`M9H+zs8^%gVS%i3b|EEu1TsxL5=U%T|iGMnTA$B2p1l}w^3<7<{Zs${q9;0Ll&aC z;~YGjE)tiR;pVJUP+V4|L{R{<1FKuON;b=07*CjX^G=2Wn#269&#%_{UktN!V(0^= zKlBZ_hWH=_h+WN<5myMg!P5tP;)KCpd;6&k*0q`$q!&KD_e29$-ZErGhdbPZH^~CL zTO5NcixQX%Ru-IMxwKMTH;ozdFvVcb3;|IhR8!tlRF{a#U(o@(B5)iP-v>`LZmsSG zLs~hTt59RU#u`0efJ)?eryxl;V~gN2-k}&c(AAqfg<>7agYO1*~8p=g?V@|W;Yg(MO}Owr*S_D`zV5HtRUTFDHjv`+0>~{VAw+OP`tiYf%wWSfJElg!>;xau_FXBZP-;#Du5A6r z_%Y!6*2hfA0iv^*K`kvMO`Vi~H}by663^JLN0ILA?A3k41uV1pQ3AtJ$T}c9-f9l6I5< z9CtY{V91^cyf#V;m%*{@;S5b(K;`Y&b!InaJNhJs-p|4xm0uk@RX|=$UW50Hngt>r z5O;mOA&?vJ)$iCz8>pQB9ncU28UV>NP)m0Zm<4_%0yg%T9I$-^7Qj2Nd^U+C=iG5}jx+Dx2T)34>3TL`8LBqqOQa zpI<>0`ahwHAOigQ}s#Hepk52jKQ&%5$P?B+a3Ha(v zz^GGd3Y@EIf{3q9KLy`3DrRiul(i^@UtDcg)u)OV$I;k59>kxBkO7w@=z=7iG`5D? zY6naV`?D-CUA?cqSa>zhLZVEzdc|M;-Xj|!=d@=Sio4l)I|Cf3*8!c<0o|6N8fs5u zqz;m$Rr9!NK|$M19Mpd71bnq_ncI9+O5_7vQMob}*3=kZfBQGjI_xCpXyTQ}VLxLm z;g%?7Hl(Ayr*JkB?k|T~!n9?9^SHwqCU~t9e5hD~(A_QAhTRtJoUjdg+NvpSn5Ay^ zw3{OgFHjX}H21~xd~&Inz9BKv7uN!1W(J1v@ZE#+9~yTP*sv4O4(4E za7TT#UW;ny=Dug4&5*+7-<&yc`+%3fFDU-`uYWxPKH7&9dr_n-fTOev@$kl%HL^PA zR3KR`Z&z|nxV7Y3>0K%p%K`Rm!knrPzQd^k^#Mj<&#RH8NGxQywAKI=ZRnq+6?^;G zp4NQ!=ZdHjkIs_wGcr|g4$d)O@5!?; zNzCZ_H0a{;IxXkLQxOmS;JWTgB6|bBx7uK4H2s;^iYtUctN=BxuBj`|#4>!{65e3o z)mtEqu1D9XRr)9oJeV|I#mS0QP^;n_ITrw^d^8Qo7oIO6YCa;=I?9;bP{lz{)F0DE z?G1Y^T+Zv$VY`rLynCZlGGM}RnHg!aCwh$`Ka7?;8~g`usIiaJvzSQgzNn12uM#0r0!(S!y#v)}+TDxFs~lfk+2B~6jiVHKP&93LtxoDgd^kkb{rn716mM-g1+ecmq_{yc)|79pm=Z zRKc@WZcK$?`RHR~G{MK$G$j1#we~gJt4*eJ=!jV!7nkG5W5##X6%W&PeDY3;ORiv&_fw`&3)6C zte}5JnffD#rRoDFsj^y`X*ED@ssHBqKDlo;|M5z@ zD85q}QsH7k1@|%dRuGJm~xBTS@d!rSMW9~ac=hv7(Bv9$_~6Vq^6ZXaDD8H_i#IrwrMg}QSyr< zLT*K3@$|o70XTRiuTq)btP?j}V96~XVy>E9CXz%Id*p93{AQW9ENN~C%p>8i-933K zwotu+8KWTG*DO+Vu0B~MogC-p66zJ8b-)wR44d7Q_V}%VWYBj-%;13=m0ZehehGCJ zpd)n#g3C}Fs;u=}%M3HLx&I)Y_5+%>X`zF!|VeFR~|8|3joVF zh%3fEBIMW)bo({aB4U~c6RNT0s|s7-+qnykhG>H@x5W9m_c?eb%?lE`<` zU*}}7ByekMgi#*AtX8mXNQ`;$TycYzf}k?yc!@+b)*SFIomWxao^RRQc@z2|4yQJE z17dpf-Q0qeHw1=LhcB?j5c3`fji;!&y@Lr{e>uYKRDOG*;FoR5Qaafr^~1C_GVoFM zzE$%Ugoc;hg|t?lumuarXRoORadT8}T@RpLQ#cXC%S;zFU z4mw{s*qJpeA+T&?lVKOF1K)?4*!080Mi`I*SxjG%$2_vYIuc4?v6ZFE){c1;R&jL{ z01{;>kJ>idfiHfAR&rn2&h=3!l3dyQl0fsWTND!{ttNGjY8)O{mT1~u@IGhXZ(nP!}$+|s1q zIzClIOl<9=Lams#AgtX((>K5l1rDXLth9}Gn}+^=z*D9^%GOfLXi|GbWSL33S}H8C zte%R_q4t~a0BNA7xYh2efkc$lB`VcXCTH9)=SJlh`|RYSx{X95bWzKmYe|h``qE&g zACv&IWi&zdfO5t#eI-Mr6|l99TQV^8yC^YKWtyG;iDFGF_WG z`qg2FI<^9>Ob3^+DR7S*#Yqjwq@IHZ=Uyg0y<9nAkKttX5$3hpElp#4vQqSr6Q4V` zCIuS)^tXKAK7cOBzON<&PqUFyE8&9HB^o)gehN9#k|K3O(V={17cnA&TG+&eWW$f> zcidEDzP6W5WD|rkU5lIEU^xMQRWY?bL;S7r%Um{hSW1LKN<< z>=?_2DT{kBx2@+2x)vbMBD~t@Eer;zio#V~#j+Dbws%#Eis<43jl^kEWbF zGApDb6YO|y4^0sKo^yXbOhh(`RK5CHfV&7H&k;BksOKM-K3_ge{$;|Ua|)FoD1iTL z>ePX58|xUs%a>vQ`ri!nqCk78``3kWR0-8(h{1D%|Jp%Kz98DTd$PHy0u-s22(~YEV1R%_EnSI3mX1xo>HmD9M@^;1CMkggAv>Rqh=%>5?b+OPVBL zP#bu=06JE`25jBfVll;3(=HzFd(eTjax5bCn1=VO0PWR(KohdQR1n@Je%H5{x%U#T zVKK=ToF=)euVT~|MMUw+sFaFZ&A-M1#)GWjy!oP?wUH`k(!UkyRpC+T$-CRzByNbJhy*X#encVH#AG?ZFqd6asOJKxD#alGcf;xporhFXSo-4=#<^mhh&xOYnD z;~j!r%Gpt#j63u`?hw=_2$;RezcRC!{=a93O82$IeR6z!HGNxleX(RI-IhlhD0`FW zNenUDa+|HrMTDF$FO`-DPuVHTv9o$@lXOifnj6ERBg*yxKxHxTPGB=itY!|RI(Bq` zg>90m$QA)JsM;-VN$SDVhF8PAqZs*8D13lwJ@y7=pUFcXBP3K@*7>h-PnN?PUy78h}VFDR^!*wA+Jhe7>N^lJh~W9z$-Ca;f%}-513zI zv+P*K$S(pU)HE-qcaApkY0x-6$?w8mv}10h9_MRxKo2d(q3scr>R6@LtR?Ns4ef|C zyD1V_bj!$qa^yP6=EpMR zafXXb3s^dO*`_`3b4G1!Md{hLJYGH_LDjrG?o{s9O+26Pguun2r<6N@xJf6kPKQ?* zv!C`NJ1bvp!-{0}o`rvBT^DxNlpueF%uvq*mft8~pYv`t86qA6YAMFKs!9s<@>E;e zBRv$awBFMFe{gmIwWCti6k}6R85^D-Wr>eS&mQ#74A>>zqKgu<)-;xn`hRHlhX4*! zy7@1nkSiuV-AOc~6FXwRBS7GXf{l52#M3^Aij)1SH~0R0Fcv+zV~!#P?tFBI0^4>r z3%NoH|Hc8+5U7}j#KTSpOyaq{t2%*7H@ql(aD-ZW z54SiRqc*{#X5%zt8D@v=CKijn7;n~eN5B`O_oR3#j{k;L!}npC#kvbp0s(IxDdgOi zx2G^f89MNhE`lYuKR2^uQm>e+*}5<=(-+Nsh5;`|^O>6N!Cf>>DeW@6?V9bk@4-Ei z>?p&KQ~TiwcIp+FsY>qU{nU(kD~~KdZ73ag{^G`1Cbp0I_H!}_?&DY&%OFw(`rCVO zf58UPyniuR!LNoX#%!$=x^*C%Q$&U5M}SecGPoik8eVec`m^>i)x>p4AvQ<_NfhB{ zoW1$$Q;>_6U)iL8_spy2Sm4HM`O$lazq8lU6qYzR8EWGhBL#^406wYy7*X@KJMj+F z_PoQf$%)}&y=-NuNcGo}OHf}g;yiVB7IFwf;|ZwOSJpAIOcdbC@N4J*jSeovGBEyb z2_;_;vC`qqBL9O~U#m!f%dq<+2dA;vI{z#g|HAvMXR+DGW28=$j>hf_vG-r#-+pmf ze+@;5+L_ewB}kQ`uTVh4AQ-1lap9O8f2ACU6Ae+7(Yk(WRqIzQYhXBLDl;ao3U8Jk2O)1QE%x{%m z|L8OF}Jbh#axuVl@AuZovtW;+{oN%KVC9Q zU7ZnG4*b^H%s=?}LQg1`XZ(VEw5v4|-U0vP^d%%-qD*v-zEkt^ji~DGXNDa+@SU(2 zDH)k*4fPTk;|}C6URAeY>0+MOmT!Fw6D+V6w78`s>;3%m?D+IR$XR)1Ol)2yyWat4 zk!x}=%Vt5$UrT*M`OW3?{J|L+dY<-!q8O{=UPCuEc_<9Ip=+SXeGyAx^p2m6lCK@< z7k9;rWYQgTd*d}6s(15Y!)KKekdwWu;2k%eE+C{cbjMJWa0dLawC5PK(0~ED$vFqL z_RFkn{|dOU9M1?e;|xp1j?7fD?POu5T*1fK{FT!6^{5M+FeR+ESQ)8>`2nxR3h=Sp9}*$QRPPbo=>-2hNT=iWj{&MvZ9xM zx>C<0myd`)Z{4%&(IZEQEy~2OKkbZrA-@Joi(DY8q|O=BL`Hxdjeo%r!?s!S)83p^ zwG8tO)FRIO_q(ou7I>f9fNxoFxtJoQ7vvG$ncss)-B$f&`h}F;|8FxW-vQPq;lm5U zhkSSL6Ff;n10#e4dmWR&oXw8lxTpqIhG>+n=NCZqC(L#CU|J8vvEI%&`U&WZFM$5V zB2f<8p1_b zTK_+g@cO~4`CT1&(O8R~wPp43ykHBO|2Z#gS_b4Upi8(zkJ=rEfsh-xnj$Q|UOvY; zXg=$;@MX>rX;ZUv{D?TkcZJ{9WK zdZg!G35G7u{x;bjcgn~T?lexncVtJvp&|c#U=x^uM=xF!FZHbjlR{2#Q6fc%v4t`y zNq139Lok`gVP;!3i&{g1XLqvFk7s{zCAS<;Hn;um&QC%41;p^Yxcm;b^NXJ6A9$GW zFL;|@E_A)%?dJa)WJh zR%jjstU)e*cQFdT&>Rd4nJ>0h)@fLH4lARHTnNHH5fn#M=rJ!^E%v|ij|tPm&AEFF z^D_&N99tS57JT_M_=`um4eNRCj7sT#G?ROh;O_AFJAmUvxF6t_re68?a3?dV7jz$c zv)&`^5$biLl{xm1%T;Q?UKCQfFjzhcR-(j^8u5AvtvVw62({aKr+!GqB{JxR7+?=C zF&{Cy?*5HE$5Zx(I_}QXen@n0`WK?oU(?sHIGf~_uxXJQJ@Ya zEI=lPW&GN=TZ4eLI@81F#k5z@fyUvIk!9r2vcPjRxtP|3dbH4u`u-Ne{gbLv+fHq5 zyy0pA@tuHcf4#nvs1IDvMf(*9e&LuKK;X2J#k8!x9~>8`C`*^XvWt(cQRcI#u!Q8V zsdUv1<%%Xf9{IT2?J)OZm3~7tChuif8j!r01{_!7S-hyr@lNQPUKx&%h&%O%*iQu$ zBS??k&zrb~WV#Fn76@GT&kJBKhW;>IgO$aYNffK78h(n1W=spncEW@Ab1*FA+_0E%=c|NnHx^Lf6r;CuT^f33O%X8hrc1vlVR$>%r&RPtCt_|hqixCEyn zHxO_oWO(n{pVizzBQjW_iv=vU`lTWcR$~qYh4bP8i$5ts_*+5gpwxdy{oLURYXX^@ z*@#nO&-?0mz0U8|tH6rUVL5-*b~J%q6+EY+uN}Q&1$l9D_#UVs#agY71x}a1#f4-~ z10EK~g<-B=Ym^kyWU?$T!V}qnRncIMVN>;LHfuVTCb?vgmuZDkP&R=+$sDTRe0NE| z>AZ%Q;g|(~xgP2{}O*d-{u4h2&Zg*fo?s@qh1nN)9 z)mI3)o}kF+&CM-2P&IMtz{ftg+U zO!Ud)l~2!Mm4sT08&0K2q}qtI$Z^LsYz$AfV3qZFj(!6|35aC2_nRsBR4a#Hh#Tns z#j30CfX^o=AKlmZ@|+Tfs=y!cA3vW>a!M`eci%4gPoqP(Ruq?lcVU1pNVX0z%-q>Z zvJIc>U)CgGTn~RKt8d7!Ab|&+Cm(gu$DqVUPO(b{*SeLbzAEauf)r+yr%tK~sMfCW zF;EpP2yfp7guR@I+zCb?D2%zA*nNQNw_@%Aa9yhbYJ_HqOT%Q7j<#kJ0hNO{hot&kxuT=@1R_##8ah=`w%*Zh z9+}sz^iRO6Rkke+t033Y2K1l`hL9lZn6`$yn-ug;)mTQ+;2Bi#yl@0c@>5JfkunBVvm9B5YI3z?8B;1S z*lpw8d9tPXoMJ5e4%oOU%m0{s^*dnoZm;fWMoYh6v1#hsa>ZHHoBW87Nr=^}d{&+X*I)jIwTwIZ z{;sj3l=COyYIZLyyLT33YhUSP%*9b@6BrYS`Uv}^-zi1m$v62jc{-wNi>S{4y`q<| z$9}UK5h?vDy$?C7tL7c$xEJl$!8~u>z3BD7ze*?FS0(Xt5N6J}y66$LLkE-kriWh0 zV7Ga1JXRm%jX^<}L1Fei #include #include "testing_helpers.hpp" - +#define maxlength (1 << 25) +int a[maxlength], b[maxlength], c[maxlength]; int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; - + int SIZE = 1 << 8; + int NPOT = SIZE - 3; + +for (int SIZE = 1 << 8; SIZE < maxlength; SIZE <<= 1) +{ + NPOT = SIZE - 3; // Scan tests printf("\n"); @@ -120,6 +123,7 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); +} system("pause"); } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index aaf0ac8..d793ff2 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,6 +1,8 @@ #include #include "stream_compaction\common.h" #include "cpu.h" +#include +#include namespace StreamCompaction { namespace CPU { @@ -108,7 +110,15 @@ int compactWithScan(int n, int *odata, const int *idata) { for (int i = 0; i < n; i++) odata[i] = idata[i] ? 1 : 0; + auto start = std::chrono::system_clock::now(); scan(n, odata, odata); + auto end = std::chrono::system_clock::now(); + //auto duration = std::chrono::duration_cast(end - start); + std::chrono::duration duration = end - start; + std::cout << duration.count() << std::endl; + //FILE* fp = fopen("efficient.txt", "a+"); + //fprintf(fp, "%d %f\n", ilog2ceil(n), duration.count()); + //fclose(fp); for (int i = 0; i < n - 1; i++) { if (odata[i] != odata[i + 1]) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 89d78bd..3e2eb31 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -18,17 +18,23 @@ int *dev_ScanResult; int *dev_OutputData; int *dev_total; -//__global__ void CudaUpSweep(int d, int *data) +int threadPerBlock = 1024; +int BlockNum; +//__global__ void CudaUpSweep(int d, int *data, int addTimes) //{ -// int thid = threadIdx.x; +// int thid = (blockIdx.x * blockDim.x) + threadIdx.x; +// if (thid >= addTimes) +// return; // int m = 1 << (d + 1); // if (!(thid % m)) // data[thid + m - 1] += data[thid + (m >> 1) - 1]; //} // -//__global__ void CudaDownSweep(int d, int *data) +//__global__ void CudaDownSweep(int d, int *data, int addTimes) //{ -// int thid = threadIdx.x; +// int thid = (blockIdx.x * blockDim.x) + threadIdx.x; +// if (thid >= addTimes) +// return; // int m = 1 << (d + 1); // if (!(thid % m)) // { @@ -37,15 +43,19 @@ int *dev_total; // data[thid + m - 1] += temp; // } //} -__global__ void CudaUpSweep(int d, int *data) +__global__ void CudaUpSweep(int d, int *data, int addTimes) { - int thid = threadIdx.x; + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; } -__global__ void CudaDownSweep(int d, int *data) +__global__ void CudaDownSweep(int d, int *data, int addTimes) { - int thid = threadIdx.x; + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; int m = (thid + 1) * (1 << (d + 1)); int temp = data[m - 1 - (1 << d)]; data[m - 1 - (1 << d)] = data[m - 1]; @@ -58,7 +68,7 @@ void scan(int n, int *odata, const int *idata) { //int odata[8]; int nCeilLog = ilog2ceil(n); int nLength = 1 << nCeilLog; - + cudaMalloc((void**)&dev_Data, nLength * sizeof(int)); checkCUDAError("cudaMalloc failed!"); @@ -68,7 +78,8 @@ void scan(int n, int *odata, const int *idata) { for (int i = 0; i < nCeilLog; i++) { int addTimes = 1 << (nCeilLog - 1 - i); - CudaUpSweep<<<1, addTimes>>>(i, dev_Data); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_Data, addTimes); } cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); @@ -76,8 +87,22 @@ void scan(int n, int *odata, const int *idata) { for (int i = nCeilLog - 1; i >= 0; i--) { int addTimes = 1 << (nCeilLog - 1 - i); - CudaDownSweep<<<1, addTimes>>>(i, dev_Data); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_Data, addTimes); } + //for (int i = 0; i < nCeilLog; i++) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaUpSweep<<>>(i, dev_Data, nLength); + //} + + //cudaMemset(dev_Data + nLength - 1, 0, sizeof(int)); + //checkCUDAError("cudaMemset failed!"); + //for (int i = nCeilLog - 1; i >= 0; i--) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaDownSweep<<>>(i, dev_Data, nLength); + //} cudaMemcpy(odata, dev_Data, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy to host failed!"); @@ -87,23 +112,28 @@ void scan(int n, int *odata, const int *idata) { cudaFree(dev_Data); } -__global__ void CudaGetFlag(int *out, int *in) +__global__ void CudaGetFlag(int *out, int *in, int n) { - int thid = threadIdx.x; + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; out[thid] = in[thid] ? 1 : 0; } -__global__ void CudaGetResult(int *result, int *flag, int *scanResult, int *data) +__global__ void CudaGetResult(int *result, int *flag, int *scanResult, int *data, int n) { - int thid = threadIdx.x; + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; if (flag[thid]) result[scanResult[thid]] = data[thid]; } -__global__ void CudaGetTotal(int *total, int *flag) +__global__ void CudaGetTotal(int *total, int *flag, int n) { - int thid = threadIdx.x; - + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; if (flag[thid]) { total[0] = total[0] + total[1]; @@ -150,25 +180,60 @@ int compact(int n, int *odata, const int *idata) { checkCUDAError("cudaMemcpy to device failed!"); // dev_Flag is 0 or 1, calculate dev_Flag - CudaGetFlag<<<1, nLength>>>(dev_Flag, dev_Data); + BlockNum = nLength / threadPerBlock + 1; + CudaGetFlag<<>>(dev_Flag, dev_Data, nLength); + + + + + // now scan cudaMemcpy(dev_ScanResult, dev_Flag, nLength * sizeof(int), cudaMemcpyDeviceToDevice); checkCUDAError("cudaMemcpy device to device failed!"); + +float time_elapsed=0; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); + for (int i = 0; i < nCeilLog; i++) { int addTimes = 1 << (nCeilLog - 1 - i); - CudaUpSweep<<<1, addTimes>>>(i, dev_ScanResult); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_ScanResult, addTimes); } cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); checkCUDAError("cudaMemcpy to device failed!"); for (int i = nCeilLog - 1; i >= 0; i--) { int addTimes = 1 << (nCeilLog - 1 - i); - CudaDownSweep<<<1, addTimes>>>(i, dev_ScanResult); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_ScanResult, addTimes); } - - CudaGetResult<<<1, n>>>(dev_OutputData, dev_Flag, dev_ScanResult, dev_Data); + //for (int i = 0; i < nCeilLog; i++) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaUpSweep<<>>(i, dev_ScanResult, nLength); + //} + //cudaMemset(dev_ScanResult + nLength - 1, 0, sizeof(int)); + //checkCUDAError("cudaMemcpy to device failed!"); + //for (int i = nCeilLog - 1; i >= 0; i--) + //{ + // BlockNum = nLength / threadPerBlock + 1; + // CudaDownSweep<<>>(i, dev_ScanResult, nLength); + //} + cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +//FILE* fp = fopen("efficient.txt", "a+"); +//fprintf(fp, "%d %f\n", nCeilLog, time_elapsed); +//fclose(fp); + + BlockNum = n / threadPerBlock + 1; + CudaGetResult<<>>(dev_OutputData, dev_Flag, dev_ScanResult, dev_Data, n); cudaMemcpy(odata, dev_OutputData, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy to host failed!"); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index f3009ba..ca7b617 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,12 +11,16 @@ namespace Naive { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ +int threadPerBlock = 1024; +int BlockNum; int *dev_Data[2]; -__global__ void CudaScan(int d, int *in, int *out) +__global__ void CudaScan(int d, int *in, int *out, int n) { - int thid = threadIdx.x; + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; int m = 1 << (d - 1); if (thid >= m) @@ -39,11 +43,24 @@ void scan(int n, int *odata, const int *idata) { checkCUDAError("cudaMemcpy to device failed!"); int nOutputIndex = 0; + float time_elapsed=0; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); for (int i = 1; i <= nCeilLog; i++) { nOutputIndex ^= 1; - CudaScan<<<1, nLength>>>(i, dev_Data[nOutputIndex ^ 1], dev_Data[nOutputIndex]); + BlockNum = nLength / threadPerBlock + 1; + CudaScan<<>>(i, dev_Data[nOutputIndex ^ 1], dev_Data[nOutputIndex], nLength); } + cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +//FILE* fp = fopen("efficient.txt", "a+"); +//fprintf(fp, "%d %f\n", nCeilLog, time_elapsed); +//fclose(fp); odata[0] = 0; cudaMemcpy(odata + 1, dev_Data[nOutputIndex], sizeof(int) * (n - 1), cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy to host failed!"); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 4f20816..da69007 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include namespace StreamCompaction { namespace Thrust { @@ -18,7 +19,15 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); +auto start = std::chrono::system_clock::now(); thrust::exclusive_scan(idata, idata + n, odata); + auto end = std::chrono::system_clock::now(); + //auto duration = std::chrono::duration_cast(end - start); + auto duration = std::chrono::duration_cast(end-start).count(); + std::cout << duration << std::endl; + FILE* fp = fopen("efficient.txt", "a+"); + fprintf(fp, "%d %I64d\n", ilog2ceil(n), duration); + fclose(fp); } } From c6852e86edf77c825298c6128c00986abdc68aba Mon Sep 17 00:00:00 2001 From: immiao Date: Tue, 27 Sep 2016 13:35:33 -0400 Subject: [PATCH 7/8] Add radix sort --- README.md | 348 ++++++++++++++++++++++++++++++- img/table.jpg | Bin 0 -> 20676 bytes src/main.cpp | 196 +++++++++-------- src/testing_helpers.hpp | 4 +- stream_compaction/CMakeLists.txt | 2 + stream_compaction/cpu.cu | 2 +- stream_compaction/efficient.cu | 2 +- stream_compaction/radix.cu | 184 ++++++++++++++++ stream_compaction/radix.h | 7 + stream_compaction/thrust.cu | 14 +- 10 files changed, 651 insertions(+), 108 deletions(-) create mode 100644 img/table.jpg create mode 100644 stream_compaction/radix.cu create mode 100644 stream_compaction/radix.h diff --git a/README.md b/README.md index bff33ff..49f1dac 100644 --- a/README.md +++ b/README.md @@ -8,27 +8,359 @@ CUDA Stream Compaction ## Screenshot -___ - All tests passed. + ![](./img/passed.jpg) ## Performance Analysis -___ ### Scan Comparison -| length of input| CPU Scan (ms) | Naïve Scan (ms) | non-optimized efficient Scan (ms) | optimized efficient scan (ms) |thrust scan (ms)| +| length of input| cpu scan (ms) | naive scan (ms) | non-optimized efficient scan (ms) | optimized efficient scan (ms) |thrust scan (ms)| |----------------|---------------|-----------------|-----------------------------------|-------------------------------|----------------| | 2^10 | 0 | 0.0418 | 0.0882 | 0.0954 |0 | | 2^13 | 0 | 0.0761 | 0.1681 | 0.1473 |0 | | 2^16 | 0 | 0.4851 | 0.9168 | 0.3155 |0 | | 2^19 | 2.0001 | 3.5169 | 7.7984 | 1.6069 |1 | -| 2^22 | 33.0019 | 31.4348 | 61.9076 | 11.3602 |5.003 | +| 2^22 | 33.0019 | 31.4348 | 61.9076 | 11.3602 |3.002 | + +**non-optimized efficient scan:** + + __global__ void CudaUpSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = 1 << (d + 1); + if (!(thid % m)) + data[thid + m - 1] += data[thid + (m >> 1) - 1]; + } + + __global__ void CudaDownSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = 1 << (d + 1); + if (!(thid % m)) + { + int temp = data[thid + (m >> 1) - 1]; + data[thid + (m >> 1) - 1] = data[thid + m - 1]; + data[thid + m - 1] += temp; + } + } + +**optimized efficient scan:** + + __global__ void CudaUpSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; + } + + __global__ void CudaDownSweep(int d, int *data, int addTimes) + { + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; + } + +### Block Size Comparison + +Comparing blocksize = 1024, 512, 256, 128, 64, it seems that *128 <= blocksize <= 512* performs better on my machine. + +![](./img/table.jpg) + +## Question + +- **Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.** + +See Performance Analysis. + +- **Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).** + +See Performance Analysis. + +- **Write a brief explanation of the phenomena you see here.** + +The problem of non-optimized efficient scan is that lots of threads are doing nothing because we're examining if the index is divisible in the thread. The solution is to eliminate these threads. Actually we're able to access the useful indices of data directly by deducing some formulas using the index of thread. + +- **Paste the output of the test program into a triple-backtick block in your README.** + +The output of the test program: + + **************** + ** SCAN TESTS ** + **************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] + ==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 41108905 + 6 ] + ==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 41108897 + 4 ] + passed + ==== naive scan, power-of-two ==== + passed + ==== naive scan, non-power-of-two ==== + passed + ==== work-efficient scan, power-of-two ==== + passed + ==== work-efficient scan, non-power-of-two ==== + passed + ==== thrust scan, power-of-two ==== + passed + ==== thrust scan, non-power-of-two ==== + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] + ==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + ==== work-efficient compact, power-of-two ==== + passed + ==== work-efficient compact, non-power-of-two ==== + passed + +## Extra Credits + +### Optimized efficient scan + +See performance analysis. + +### Radix sort -The performance of **Naive Boids**, **Uniform Boids** and **Coherent Uniform Boids** is measured by FPS. Different amounts of boids are considered. The results are as below. +Implemented in radix.h and radix.cu. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 256 + Time:0.893440 ms + Unsorted Array: + [ 129 65 3 2 164 225 233 65 163 238 5 32 39 ... 194 0 ] + Sorted Array: + [ 0 0 0 0 1 1 2 3 3 5 5 5 6 ... 253 254 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 512 + Time:1.707232 ms + Unsorted Array: + [ 359 307 257 255 382 173 233 269 365 470 220 4 277 ... 263 0 ] + Sorted Array: + [ 0 1 1 1 3 3 4 4 5 5 6 6 7 ... 510 510 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 1024 + Time:2.026848 ms + Unsorted Array: + [ 858 812 768 254 875 147 233 755 850 970 200 1013 780 ... 280 0 ] + Sorted Array: + [ 0 0 0 3 4 4 4 5 5 6 8 10 11 ... 1021 1022 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 2048 + Time:1.603968 ms + Unsorted Array: + [ 852 809 768 1277 866 134 233 1766 837 1988 190 2030 776 ... 6 0 ] + Sorted Array: + [ 0 0 0 2 3 6 6 6 8 8 9 10 12 ... 2046 2046 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 4096 + Time:2.233344 ms + Unsorted Array: + [ 849 2855 768 1277 2909 2175 233 1760 2878 4033 185 2027 774 ... 432 0 ] + Sorted Array: + [ 0 0 0 1 2 3 6 7 8 9 9 9 11 ... 4093 4094 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 8192 + Time:3.248928 ms + Unsorted Array: + [ 4943 6950 768 1277 2907 2172 233 1757 2875 4032 4278 6121 773 ... 5148 0 + ] + Sorted Array: + [ 0 0 0 1 1 3 3 5 5 7 7 7 8 ... 8189 8189 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 16384 + Time:3.312192 ms + Unsorted Array: + [ 13134 6950 768 1277 2906 10362 233 9947 11065 12223 4277 14312 8964 ... 53 + 62 0 ] + Sorted Array: + [ 0 1 1 1 1 2 3 4 5 5 6 6 7 ... 16382 16382 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 32768 + Time:4.561664 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 18084 0 ] + Sorted Array: + [ 0 0 0 1 1 1 2 4 5 7 7 7 8 ... 32764 32765 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 65536 + Time:6.343616 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 27145 0 ] + Sorted Array: + [ 0 0 1 1 1 1 1 1 2 2 2 4 4 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 131072 + Time:10.497824 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 12498 0 ] + Sorted Array: + [ 0 0 1 1 1 1 1 1 1 1 1 1 2 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 262144 + Time:17.427200 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 15972 0 ] + Sorted Array: + [ 0 0 0 0 0 1 1 1 1 1 1 1 1 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 524288 + Time:32.032894 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 22920 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 1048576 + Time:61.234879 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 4048 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 2097152 + Time:122.582626 ms + Unsorted Array: + [ 13134 6950 768 1277 19289 26745 233 26330 27448 12223 20660 14312 8964 ... + 31840 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 4194304 + Time:232.911423 ms + Unsorted Array: + [ 13138 17699 18632 25340 29604 30672 3346 23402 11192 6075 20512 14356 3110 + 2 ... 20531 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 8388608 + Time:438.904877 ms + Unsorted Array: + [ 13138 17699 18632 25340 29604 30672 3346 23402 11192 6075 20512 14356 3110 + 2 ... 17267 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed + + **************** + ** RADIX SORT TESTS ** + **************** + ==== Radix Sort ==== + Array Size = 16777216 + Time:875.574402 ms + Unsorted Array: + [ 13144 6428 21592 7931 17465 5758 9571 17548 11447 26547 20216 14444 9844 . + .. 9561 0 ] + Sorted Array: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed diff --git a/img/table.jpg b/img/table.jpg new file mode 100644 index 0000000000000000000000000000000000000000..91ee62de2162077cb679f95cac4d4a6bfde776ae GIT binary patch literal 20676 zcmch<1z6PE_CGvBcMjd1(k=iYPf zJ@@xK?>n=fFT3}O&suxWo;BP}-z@?#9!bkd1AssvzzFgN+|2?c00?k!@NlpQ@bK`6 zhzLk1AXF4&WE27{Y;+I_AsHzNAu%yIH47a%B{LNGaEY>7Z({FzYrgXAPXlK z$NeNgL_|asWE6ZGfW`m-fH&WM41_0`KJi9PHF0gI5qPXxk{O*6 zorq%-_+=`Cx1i~;z!^?@ti zR3{S$v3Tgiu~Z^9KHk0-o+I1nRH3IieJeLGZ!*t>FfT68LpQu{*f-V|sxIc|Z(@hs zE(1?yy-hnVE`h5zXEN~x`641?JF$aX2k3Nf3w@m1$H~$T&O$RbS1KV33Fb_Ak~YYU zCNNO`l0=YYb0dCD%3Y&>>OlEY6adUj-TX;vT7ut?9F*0gpH6`;k0(A9}pqF51{WnX!I-;r-!l=K>*VrUa+77r|EMPihg1=zmg6 zn+pa2plPj#hDOwfCi;?Clv%Cb$jFwS3K8e$*Kl&3TyY7$dCbu`oP?OG0SO8W`(UR(ee0bjl21&dR)0yDVbZYku%Yqt`CRpEvqj;g{)L+Ey>f!3Vhl=Ayghljo@M=&COp3t z3z6#cGfaK>FuT60HxHy_m5XHI|)IVIG&&bvo|3jxz11N3;6AmQ07xUqN&6kqG+=k{e zUy%%Mp>kn6_l1+tYDUki^Ngd1FYFuJd}YMfPIl)l{?P5cXpkZSSn5J3r$eF;njx=? zp~f*mAARM`g6@nVC6CIr7wk`-t_^IT4t{GDuC@-$ahF5t006mZesl*Q7J=d6J&s=( z^>5r`O=s!tU7HfRV=Teg!F~5`wlJdr-gBI2bbL)to*z` z(D{o%eqa=j3jil@ct+}zNmiWFJvznPJ#^YcmcB5j^zBB!uUwJWiaS~EyZt~k_!Evl zarsCQ>rZw>p=t!cw|kcF^^@f9IQ*Xf-K1c~hXJJaxu2+@3SEs}PeF8n!N>|ft+ z`YzS~tukF!cuGVRfac@BekuB6J?KxTA0SoVU%>8@mLEC5N7WT?vY*xWRWiKV40z8? z>ugrGVqRvtc9fZlQO-Ec|NDOE{rp=KEI*=jRa3P*O+*x+{4XYKACeo zibKFC{_GNNX2MQwQdP~qTFr!)I+(ye(wp!;+pO-fKC=x!K`V`P0kb#e zt`?7rk7RXYf@0ie$ZE;D945s=>FfBC$;TtwwA5*q9`WUF;mmec@5^wyjWmb4l6{d@=bpz&proho*X+LCTr zcJ%Sh;}zNTTblIdXIAEE<#jsduYO5sMSm*3(s^3^FExgT^+SbiFSKr6D9^4h2xniM zJvlogN+Qbt_H_TV-m-PmN9O8ZQkCoEFQxmI_d9iesv*7!?_+-FA=~F;d$P^?Mzh^e zOcRrTj{J+uq?``2(HokJH#`?-ctgbt6(`M0Y za7ki?K?b~8^aq!SN$xkXvx^&5Az$r@HpUHwOMKe9E{F*2(oFbtFs^I9R5Atk=C8C# zSrZP0?j82mS?v7}!f>uD;7W)JwCYQYcFswVT%Q^DKhSz>3{i&3i*__`Drwn1Vbq`E zNm#YhTG_5%{eZFhN<)r_keB2@>;0rJ{NA_tCQSC(FJIYKpB}ZnkUH6}-#PRVr#x&D zm&^Vj!#tX!*Bi3Zy4^c7w0zS}fK&ux$TlmswB-9|!t0}xoiZCkd+t%{^0t4H$rKBb zv^4gf!taT0_K3+~nf^=9^-p-1CB@U7=W~VP#PlXeDwaAMJ&%AaynfS;RgzAO{yf@w zy4#yK`ko~UYjxIs^o_!6y-;@D7ycG()%@8 znm+%wHl)OVrBsCanU#-pg`%<(hCo*)yKx{uCMufhPheE8U$!qrh&KHA{4#Rmf_qpm zC2lQs160mewJgvp)t@Bwe2@fW$9@zqE!lgcJbOf~+Rwow&6rIbHO~tHy>IZ?yY7rZ zo>f5?k6mm=YCKw39D}8*w;CbZlqTV^ghcAnlvF~>vr8|tUWop+P^(6%&7F}};B z#>U3}gm-7#X8|Px6~_!)fAXhDO`0tl*|WwOBaI@=%l9NM&z4D&UcbuI5)KmB@99?iyAwm&msq zdrz&PymW7BZ3TNr)BD-d2NBHhrQtWiT-ca}4qPkZzj%`W?rE5-gcp8uCHEwx_TFpt zhbvooe=8w1)pGD~@Z_%^G9(+G^X?gc^;nOQ{JT*tl7{K?QA!pQFC=@TRbyr9$6QKH z6SVkm(_cr~3tHm!o5$l9u3r{b$K}UgI@>$XOt}ibj##fcl>Mgu;eu+)nk;U5dHZwi zXM`zQ$%x7Ri3q(U*`z@;+g=%dj`TXqNv`b^+-$RaFCPy`t`wt3#-D6AiI&AsQok}2f*n0aJ>HOgkBQTMCJ;Y-R!y4zW8 z4PIW=<7)0T4vzG;NOTMbzw6C$4_%MeZawr}fq?FqXdaulv#1>0MtZIZx zxjmflAZ2+thfbWH4ikjQZ1#8{eAlM=Qgf8PPU4-*{=z#sqH|u7qdLCa=-m%4jsgUU ziw2gC>74mCG1@$uZ%E;LsH*B8QUMQSD>bW5FGb^o%(hm?!goYaLSc#rVO)@-G9Ef6 z$KWzM+yR1pAx9ntOaKrH76u3d1^x4I1O#A0L1R&{V~dKZa*$K1IItO;I39jK?LY&8 zBEU;Q497#rfu!HR;Tx2WUe>ir!c>aJ7^3jp)yp6eZQTc+!B)H>x8*MlYjZwz)e;Ur z@zs`l!z9=8@xdco*!p$$V5gw`lj`zxG21P2#wI0|WRTtiTo#8CYcbn3iC|T6y=T|G z^3XI~WHV(gHc^rrPMe>A+N0UVx)KR#Xc=14%;9toy#h)pF!_*DGam3YGATG?Wj|JA zjVlqflVm^EQT_5L46nwBF|3PS((_6(3#1&Y0-YoYYL&f}HTGN0hA&!v{@f74JNG^B z&@gZau&}@L4h4-#frZV^0R!PazKT*@#n{n5>kvflFtS9&#;IoVT@Fy+&w&dUO{+TZnlFkekuCWDB}E&}ECJ@~-GbQSHtXjY*mapgZttblw47kOQy?&r(h5 z>=3pj5Gbuy+o}|P)Me}d*(F`#EMb(}O&VD_qn`Bv8Bsmh`YqJ5NGN^ZJs`Ql8F^t! zaQxWX4?Ey(E8MNV=!e z*leFs=XaZOL_d|xLhaH22r})kr4!LSLgOcRAK@7D*#p*LEVGTF%-Uk49=9*Lsoe2Z$@!ViyPKp{=LSdm403bd>;}sLLl6Mp8je~ROt;|G;`Sa*B-oDU zP_Uip42@MdtP|4C5rl8s@j?}$w)(?5=X6=ycy8!k5*D*D6SJao45cCcs|oXj9@2h@ z62u{AZfIyIAUqN*5FX|pgpm6bc1)~zXbMV$zgJF{-ZL4-AGm@mWz52T1!>_Zm8n)j0a#^~z_BH3K{Amhk^)u8X z0{if6BOQs`5KC|m($D>>eUhT58*Lx1WiZ>!_}bMlA9kFmKL#lAJD)!u)pCU7VfrzPiR&m^xodpl2cEpygO&HD(`@ zD2>k*>)8TH9kXf<3Z=SL^Rg9H9lYfw)3PU9bHf1zp=#>_AsNgk6*bNh3-zTHbfk-u zJRYjnmfJoIEK56%+cH;TfDE4D=KbQ%4z4tL%Iu=@4gRM)>F>qt*(*>aZuN92mzNkGlLail_(!DrZ z-kd#nx*hlNjkDa2fo6$>=^k{er85^r8LxUtBY)De^cU0k+LL=(+VPnChh2THpB%;kr9OJypv;BL+3#A5t!1^xxSmDTI$WRjh5t)W*|F zzYO&_RUHWxE7BOY^<@us%W9q5u5iWpTK_QS;3wI*TgY=?wXK^DK8_bJ6HaUrS^K_o z+TcbV`j+X>RkNqe8&*k3s7Ke`s>fe1wBe{kyVrYw^N-~GPpJLQxXSe3YS04jVI}J$ ziv>R%RBIWpf|qa4QuNlFLvot++%(4QC-e)gm-B@rzAIwV&V3g-a^!90qiFz6o6^Uj zo^d?>HQnUQAr~f>CuiDLNtLU@7LA>?5lQA<`~3&K-_>@e>6pDpJo`|gIQ(10Td9_v zD=^b_g}Qx^4NepDXZ=F0?oYzDeA{AQpXi=5$t!-@o5=sX>K^fr6qZ%_4{=Fv1Nswc z%fX~f!*zixj2n*aHA$vV21S;vn)?k1hx~A!#-srCPLkeA{{bz*TJJy^Pd#+&7W96h-zebu^B{-#?}e)~*$3EuVD%dN3i{X(ZYQtr<7f5dnq57Iw0*B=&=a1g{I40lalwr8P98mL@5 zs-5!;^L(z6apPukfRy_$*A#3UOrJe$v>AB7a~9%y5p%SelLPxs`2fMQjHQ8JU4A0nR`=m=N5{XV(=WV$YcthURym!83mFr)ny(&4sWkA^AkFF< zxNVP>#LxdT4;9k+pWT_GuHd5_Lc`;-lu3`IEpFsy|7_~5*sM9soV=b%JSpcirr;X=Wf_IpV+wCO8F`v!rc&21AmlW#5~RwljMNAcj|Zqsged>fy%14S>(OMjc2=x^r! zmz8D4AAZP=QS3#EE*V-w4pm@tYp76M^Oa_mSS@$=t}~d=eDm?0cCJs(wZ3R8X&lK? zN!aRyBVW>6nPL*qMROX{Z`gV;c^R$uz`KReFE679`5KIL)nu&Mrf}|Kt1hMFuB_l@ zSyvL(uDt86&c=r{v<(bfV9CyeZ6us=b^6zW8LlWL0-wB%ZH&N&2MSU7X+F;YXa?xNxFN@!-O{Az2J9u zfHgbo@A5#f`*pRy?@Xq0K=#nRp2*`(|u#!{5rc`a~Fl8dn8m!*zxHZSz>YGEhEd@ z(SrDSCS#>E-OV`p2q4_UD4Z#%OQ%WnLSf@Vz~d zo=Q_?!ERF`@1?fZf1>-JawXgKKLI96fZ?RH{0QPHA^r$0;`xs4?CZAeV?xY!r@sUB z?}EO!09K3t0RR6}Puz{6qKaWnHh6EJAt~AdyXA5l7kNGRLoLL^%_O$Qu6;6ek&(KU zs|SDqyphlvH_X`)!=g_QU$Zgd?vbBOWplTDc6*&bUtTqkmask5osISW%YILHRo3S& zxt*kzJ3y+QHCS!KG$%dE(74?qD0L(I<-aiLR)+u+|2-+m@{rc@`F_+|7<_Y4Bnrqp)qwu{SR^AO>eCGD%A>FcXoBQwZrYs$Ay z$Gi<-Ei8gjNhS^N0LW4w7Pv$X?f{xpwZOr)vad?BRN8zAvsM+hHM7nl)n8+4U9nT# z(aL-ZznnM3KT2d8w+d#e_1T_k99*n67N~G!$-hOcc~}$JL^H_M(lnpT_gccuLP##X zIj|OkbVt#{P4DwpOUa$gb1Vk6G8U>%%ZVwtG{}NIElSV{D>y*Fm~Ma-mANn9M{lbz z`0~t$np$7ub+wkN(KZMycbel`0z`L_UNyc?`pl)fEkH7!IbuNC_NZY*gYpE(#oal< z^_lM^dMRT`h0oP~CKiK>>tR=OhIwl}W3Vftzi5Rs@xwUS^87FQdMzyQ@m(#7+dGQO zfF>a0@S>uns4*JPk@Rs{WBqe(96dC&L&4y@auCi!nSskZ6SVDw?1O#5#PaKalSgNX z+i7rpZ0TD>Jw*LYi#+8!?lrv24XSYSGbuQk(siMcjc~7;f=2iA6&_4me3zeKYJwRvShL1Wgz|uty%xQbG+c;zSg`r!&{ieyS0v!}_4l|>Rtf-4v z#zVmwWP|Wv1K}#aW>0C+*TuzUt0Di&r%d{QgQY^(R>BUgD5tk# z0+oYaq~u(0R(C4qg9>`3(wJ?nt6O;|Pnmg+m;O}nUW*6l1xvwrIi30%W6OHh5q%62 z9-j7Q*-}nf(QY=O^sLxd|D=2mF(SUeYbJf$x;Rv0qL9-(dzPDt?mGZ(>@;2Do^h$8 zZGpVOY4AFRJerxrXHW^&&>m<|6qf+yNk049pmiU=1i3I4H>d)sOHF zASQr<9dfnMDeCASpLK{)Mb0*|bY%3$mwF-;cYuK?bFgfwq?wby3K}tDJ0(1hjmeN~ z+FNsYf?`&W=q#ni}uoJb|_UOZ}&gqK3jy%I$X5iJov@>^$^vz+86tj%eGZbaH1 z?JWDDI2YSN{%KFxB(Cq3o>L*WhGb zO@R;0?-=+;p9GV~(PSTnLzmY2`NiV`h6=qMy|8h3$-@?<{lXM`-|1B*tpj5);lQp) zm86ba7%=)p1@uqDbXIY7juRydF{&#pMyVcPj&TCEV35c$aH1daJ7{2ZNL`3^0pK+X z^JBE$U}+h}sM0!FdJBVL0nkq*m{`b>)=5OF^Ijt}6BNy;I~s2R;H6p`CB}3jYyje? z+g&QgSjZUM2mmoa5QmAHx|xv5t#lqVJrL?Wtr~8J-NP+ z6@?A}R`5VSeqdoB(-t-}I0sK+C%G#L7VljPu@E`;^n=G;s5O{TQdCOjQRj_dHtjli zi*iO$3>z&~^mmD<25Ste$vAOHh$_^YD(bJ*;wwbqGckxb)AC1;mFyIzvgj0-E$T1| zrJn+V5G>fzy1xKKZbf^iK@KngFCTXYj1(i8&MFF0HQoVD>r@hf@5-gSQ&S}#TjVZfz2dMHU z!H!FAkADL8i>~zof3XmU|DBcC&!UbqRh)!Erq{uZy&0c4bsN>oIQ%0r5G4F7DU<>@ zuv~AA;XJFpky5m;ylNEP(1pO@Xvc!jkj{>%o2*Db^vjWWNnA8w@C-2s)4(qU77M@c z_8p+BLskMDg}QUyg>qv04IN|Q2o6CUq&p7Y8-zw^dJWY+wmb^274WzCz^xgQ{a42R z=!_dT_+H#Mw_+Zj zS1#eh?2ny*SU_?(s}NJ8N|;@1|J;vyVCS9Ar9Z?G$+AV2>UZfMjB(pB25@i{qH|Bb z#h~%{VuxayA#?42^p^KV;vMhBApOW&B0<_k^dyxJ@j?8N2*}iMWKi@rk;A4(me3_) ze~n@YAdz*S<^B!E?^|V`5rK|<)qibhgQEy1l|FVFj6P;Gb7erAp(Ir z;WkwL6W&7#VMFc!T>51C`ab!=4$q2Ur*D6^K0;gv{M4ai-#C0Wc5VebJOj3JnTla5 zE5eCeTKlK%`w(FCEfS%~veO#~Qa2>wqufiS%y;$M4{Jek5DTEK;s%gfCkpx!m11d| zd=Pz45fu*KSzK|%$;AB)xJ-1t0kZfUor&!L-(P6j+eO?H1gFKGmVNO4|7O*G$mGYy z{Kmt+FM-cjgxlfA4cX~U_&XW6JkIH#V};V+0n|nM7AUmG|5B2KU|gc>$tgeBETLg+ zP&iTH^$(lAl7%LRtK7DL;=p6~NcqNrm*RM?dX6KT!_F|%jBfrEivw`pVyweI)CdCU z6>)x&`X&b9Y!1f43P28Qzz84jfwOxy71+HpIqcA;v`|8lij-Pk%l7#tdW!Z4zSvx1 z_&MX1R8h596gM_rnxqv!#S9s}8Apv_YD1zW2=ZNm@x32BddzkLoooi0=(D=?7PAu3Tt`=Fm-^eGd_5?Q$BEInbVU$_j

_N5ghq(Lu`41i;NA_X0oumjY0!00SDP+|^exn3Z_kX_^{^b!;CNS z4Fqig_{#M9y-#?L z!2YLPEFi#E4~`u=7w9B*6#kd|Ld%-}r5Ns|LYRe1bQ}9b?BWR*UQR0hNAgBPk7bzE z+Z53(1C@Pa-ng$|zlSS;$2Y>+ri}yFffxF{9JrztMc!&~97h0_=RM5v1igm$JDi|| zZ!e5Nd?o#UA`ro^h>iM;t!I;*2dCf+x_0U!Nb`ZjIihUhfN=F|We$odA+iKN7vw2| zxW>>Lj3NS2(=@<}VE^N605a^Q;Jt%$$a%+m9f1<##Z_f6jjU*VP2mUT55JL+$1?*i zt=;%(rJ}L@JGbQA`MQ6r_F!_Z5Y@kdL@-4$rg8i4J4_771A>rOOcj`1V{gZPp#KYR z2%qXQ(e2{`2AWs4*UbJ+nx_)MSyYOsBhDPNWB_t!1VkzgY%2(S4$to6rVTnw#6zG^g$u|P z-}D1w^|(GLKulILqOdZQp zI>ex){mLl&G}$iND=C=Qpm7x0`)QhC#?0O$p-j^|GR?{`>G`1_ehuOhk4D@&N5-d- zOW){+&UZr3Il&0gZ%Cx!!iSCv)R^U(55wWV8fx-Y(1%d=7S+X(2=@#j;;#T0`vZcWIz2R%L@K0YfqY&wB1g)I z#@aO^On|Rg)oZkX!Yq|E0;2en*0lu#_K*_!VLG@x*EbM@g3PdufT$(M#*Fle7=>&M zDck&_7*EZ~ku*8g+4}s~BgN(uBCUOcj8;%96a*~-6ewxn!x$^$VF>(@5VG4Je{#s} zT!m=>%=q~sfIH{15Ok;|5OJV--~_b^(O*8E0x||Oz}4gxxe~&Zs8MJxdEN9=t}Y@* zYx2u4KkB30#1CRY2E44U>f@`8Up#_VPbQ}WDhNotAXSFkVEPR&7UnrvbU7IuGi4h~ ztBls-_Q52lXo>A$=_9Mvg$+o`BP~|Y53^xqbL8BZu$e#!fYzldQJnjF03xh;MN z02r;RPJVvpe?3BcXk@ZMIRDSvya%U%wz3j86kZ#D#8T%f}>){+`17a5~&gR5=(OF<)WgW zcB+#e-*V?Pootshjz@`@RncB=eD(Z3xBrzCT+x%3r4Mvq3rx z2_DD(%;$VBo(vHmfB0=o;Li;Ck69R9f1j}HPhGe|<%xf%Ke7Cs!jr+@miw*G%1?GcCm(-~I{NT4^OLaO>hFv15u7?u0we$2;pYthRvGZ5 zGLl%|ssS2`%UG7{`1TIa84DOMJcO41x30fT@jdPe66OK~!b9TBe*~IC!d@^b*dg&Q zj*tr}WB)30mClhR2e#kit}yNZ=o4fnB&y>bDKk^kfG4aH@E*?M^TO7=3@4)~<$H~y zbw10;4)vKPL|I$y1^d*)75Yb-^>!Z#uEobEHeMS`K#%#pzI>GCS>K~I*NCUX(O>OB z`n;>8sAEz^UFp6nat{N;rZxSV5?p&Prqsi;NKdx`)l>^ZA=4I#mO;dgz(cWFza@Eq z#dW6LWY#PF9ugFiD-5FiCfYFK>V9{@0XOo~7;!7>85OIp3Y}*%nNv_Rs=LoSoP2s( zb(GFIzfpuKiJRfd9#pZLYPsZzCD)2K?A{yO% z)G}G}_RR86rf&!7TNYMP>6(wbU5NeCvLUgBa#@ZDMjAt-9~i^@lj(t3?H6h5y3g6* zh%Pm`D8jSxbgs02N*M?Rg#@|z`z^xj{^l?KdsvZwR@KO%N+-F)(hn)$M|EM`kOfdJ zCc|PF$?sQqsDvxF0YM<5jXZps_XiGH3hTT4H}r(NGbgruH~}q+FpUl8&C;d$b9%SI zD9vB0*1nEg81Y!neDt2^6IOQ0<-KW&@i2d&yxP-}>8W86s030BCIZ%;*OK?`8Eq`< z+D*Mh-^tW~Lf~56My&xX5`IC#P46_`NIc9tZ%skdy;u`PU>jqT(g_oq6J0b2)*b%L zar)|2f`hoDz)CZyK4Fp1QxSk+DO$()AVzoqMsYsG_5sW;OK+OIC%M@|p49=&D<2Sv zS5-W$E@07;7EoZsINt72C9CrUQ3o_l$HqZZ7WsN0b39+|JUO3)UKZF8O{|ABJ`sE^ z>mCuhom@F}Sl~EQj}F+iJltQ_?1c~DxSeFQ_w?e~67L9FN zU+Q3O&!aPus1&ls6qK7QrQ#>65nb+qp&9p^Ju<4c&?sAxlaqyfXbuRPWT9!j8BB{h zyrnRSFlqRv0yoriACJh40r^P<O-dLJ+klA^ zLY#(|%X-t>MsQ3zrQHSNds&6B+mJ{FftLNAVyt`3;~gaf|EH;ot(-D zU-Z##&;b|CuoU|;4PKFof}E0RF@Jp--Ezp3H`{Dz<~R=+3LKqm6su9_XVmQHk16rM z?|=tPMLu2_Zxj_bI~pnxZWY_~Meu8`(kKG*$Rj!7M`{$gP&87s2H=}QEGr;aczQnN z8IdM4aYWhRBS}2}xVRA2{uf(_O_C-G^^zDvd3=wX?2M@}QBhmOkEr^nhpc!&R-MJ` z_L#%TY1@1{%)yzI*O3!RY7{W@dh7I9YGz0_4$Q6PV*CjVTkEwbhmKr#AsK= z7eX)x7~lccnp`mDpTbNuE97<>o#+`IgN@C^HF{#%dz=g8P*`XJ7`-nN<04?2sERu2 z=trrU5Ga6~Y8g-{RnBBc;5t)M9p-Gk^)-ybUX|ArpBQ6~xRP=G;7FFGae8eB(AW7Z zJXA4rC8q;NLZ?g8g@GD)U%+4zj3Cf7o*}fDX^J8xEzXm&_~j@b;in7^$n>lz~X zn!XtP!A!@>U=1)D~cGg=7(s0X9)M{wKjVeH4 z+ncUej7MkCX7Pu205n1@unj!b_@x1tq)J19irz!w%oc?ETkAKqj&l5`G`z2KU z4-4Io2ARj9=6(8hbW4VQTL0H}n5kB)?Cf84_e-X#eP+ z%ab9DSN+I(crkL;%kTk4b?3SQ)3zho5=4{o&xh7d7pnJ8X;JeqNYBVHI^IZ4h6+)U zT?D%s7cBByVRZ$?+I7D}eSHTA?Y?<%B9c7j^RfP_5NkTeWVc)xD=U!mP}$JS??r%- z7yLC-QLH(gPkV{|qvxY7*}|wTlxNR66Lvd>oP3KA%n@do^QryBu%0v^WJ%%EGQgP0 zyzvq@kV$;5`CswyJM-UVtPOAK>;At1>iY!7S@_AbqyGhZqGQt@UNcB2vz`*utddtq zn*Q58VnoIorwC*dCGmio!){o1AWt%@k*>vSe}_{jda+|fpHlg~*=u34E&J5#w)C(1 z?ZZoYh`SLRB|EYg7HqP~`p1c25#wpnu3+8UzqWqBk4O>O9HFfoByF5|Ejmj;VAjNt$(vkN#a$`BgssP=C9eh{(;Bq zLYS~_(q>m-0ZnO^K8fm^LWQRrSXsxS+8^a-r!)PeC}s3&D*ogdU}! zU}qY5Q_C11d{Ah6^NZxFDS(WXpp2eKR?VF~H9tHxA>j%yLi(EXUB?4?QbD@<>!Dg~ znVO3i-7hD@JtyY7V<(E)o~bvTLyq4Nm59E>Spf$Yxa9SLEKv?q9KGw;6VaJKH{teb zZ;3Mdgw%U(L}EiPpRD_`=HDdSZx@_{nN@b?)hP51Dg{#|t;f$@j+M)7BO7S? z^r-w)essat&1zn=vfC8d`?as~?~=4afLUq>YaI4%f4Kt~wAIARsQDUbcPb3ql1iLM zv^ux;eYI2RI$E%et8cI$z&SSEDDaT@Rv0&Rc32y?f_~-J-`O(TMEzQtoZ{x zdU+UR?^JAJt^Ii4|L?bm{_8scICrHk2)>DtW@KtP_DXLl)7&RZR&{)}Q3rQ9caM^4 z02|P1ggwNi<8(N*b$nAaI!LMap;8i^48%3$l;=J6Pxog!ke9;Q;jK+#(7QV9)oW+9 zrmMoRt`=6`t)IKfZCQV#2CTBSRQT1C^`sKo8y1xQYEv zJ==z5B`8mW2`2W&-Io)BhGXA;Y}9uE2JlMRA-0|6EkRihcg@qf zsL7GE8P5M$<>1D9?)htPQDuLOoASI`ZO8rm@rZ3#Gv76~w=qUab9dz_;JOa^^MmdtgM)-NYhHn`VLEC+D4mKJDl70Q11gmZ+cGmi` z|ADf5`iD7Xa%Buz4}S>tR*^t_XOb2<0j3sa%=;ad&#*5cJEwUy}s@qtX4gP|+P8D;b~vi*!)u4Zn654P#U<1GJIYW0or z?31T5hakaNhKrKFSJ>r4uaPh>N2afqG_if>gtDTSLse;s-sNqW`jSoEYZ@nwqv!NX zneIN~L74$!S6tigfvod8;w`qwXnINO*8MF=S5?YG(I_v1vRb#{5B?!0YMz#|QDI$q z{de^I8Dr2~UXrfIkbNrjwz~6yVs8efh*jbZOuU+J`E?PNb2ynrj}q=>^1|kp>p1pa zpXjHdY0|e52iR}AY~{%~X)ognJ`A#7r#Zqq_(CF7Ho6R%3?nQ=a6Zwc-T|yb4_sIW z%`)ZFcJmaF2)dD-l9pE3hl2dgAjBLrjY>T>P ziZ|_I?a+;3cW6BqQ(e->Yz-K56G2}WJ5msji75DTJyi+|xi-F5Nr#_!<9nG~e``Ya z=0du;4eY~|Y9D{Z-8t1lcOc$@)=9SX^U!&((61a+dBaEWE{1gOQj)9oF1|CK1k`Vs zNHWn*#XQfFS8IvP1ORH#9PW$?;u)Sk7SwQVxc|g5zaJ?7o6-JR?B8w5*dKDBoc!I~ z{>f5Lq%82zdaWjUEsWU!Mof1g7x0bnF*knc6t~5^kv^ablm%$9%1Ydo6Igs z%9F8gv5OSo4rjE9W_7pxJ;j;yT3#!=@wuYEE{3CyC%Y;4*Z+quzf6!NjR8~q_mln7 zGe@gKAE2Dvey(K9C5{Ww>N*;&cyY?y=RuXg=ab})g9sf$fK%CT;WfG_P9GLPNliEE zv={##K&comS00w~3X?pHnXT&7Z`$9L(GS2_jG-RJVFc(CM~I527Wp#GqcC41DlRnTEZ=_-fuPgA0nMM@v7?&<480*=3-J+;w&1`J0B!71cIT%ChmTz;!@<4@1kFBfJbO}E|w!J zuJJ1`-jrJU=@(Hd1J$Rs@~ZK~qLNMQ)a(QDQ+#(|0hIgMiRbKhsPa*RB!z zf$-JMg&u~NKq&HV=GVL_34|V%NMdUN*rUC!nTHWU2~g>QV}h=b==Zb7r}9e1lwtIp zRZU_rlJ;=^szFJQ1_JD71XYw1&%Vl< z9zt@28byrWCss95+MRdvuaNV#g2*5EQFk85h9^9jC1!HLphi&91DRp%e z(GUGo442yaJAj5jXjdc>O4<}Pu)wReP2HwI6r?EDVP~_Mry(_9e^)%kCHTH;TmvIL zc`?*HEE;xjd=lCOK0Iksn62q*o|KBL*O;Bam={3T0?DqQk%UJ*k2uJ+p2tX6EhT}> zBDmU6;rxvAC)clTro2txd1kND$+bl$OPmT*ZY<$l^cH1^C8HnjgE`7CC#rP>zKB5zdFdJLFK(q+KCLJT zNUVMZXVX20n*}f5V?P|Bu@|N$V@FR!g+mS#!751Ep%1pT#koGE$HPm5TQ*~qir_;q zt@2M>(YT>bjAU|!m(4SRQ8F?bH$jl4jOB>VB7wc>6Hb&1(?;awL-3t&lIbMP?vn$E zZp1La*}(PK3rfY+9t_9u7rqs)%f-=v(%hEn$9paMu5~41{WLvki89bgV1-8*qaw%Rhk&r8rct(R*Q?=_u>Cgc^oT`zi z2F&qz{B%rJrWd@I+zT1Nb}6Yw^=TN~bAUC0hLM0_dfq-XScB0)!NH4|4D#O_dVF4y z4W-?*-!i}vcjJm1H7P5TJk5h|(l<8VevA&Q^a>i+y$JFTXyRsU-oqwfk%%I|$G;5o zuO_Yx6Kx0&iViNqj0d8T`y)uBj?`%QXuK9PP@M{oB_3-i!dH=nZXN33i&CKcNFN|Z z;bs?SRT-ao3N3)Rm4dWp|~iSIV);Fm(h! zktCp5uFr-qy3-6c!oJJtrH)x+@L~w9v9V@c<_b_po^6Tzh(rrkk~%^vL=+81XbUfq z2`Y8%!qr?REYt}MC638qKNSIZWKKWH-}}ng6{m9}WK#;v?4WB3f#EVFZv6BA13K)F AssI20 literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index 0f0cbac..269cf98 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,7 +11,11 @@ #include #include #include +#include #include "testing_helpers.hpp" +#include +using namespace std; + #define maxlength (1 << 25) int a[maxlength], b[maxlength], c[maxlength]; int main(int argc, char* argv[]) { @@ -25,104 +29,114 @@ for (int SIZE = 1 << 8; SIZE < maxlength; SIZE <<= 1) printf("\n"); printf("****************\n"); - printf("** SCAN TESTS **\n"); + printf("** RADIX SORT TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, SIZE - 1); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + zeroArray(SIZE, b); + printDesc("Radix Sort"); + printf("Array Size = %d\n", SIZE); + StreamCompaction::Radix::RadixSort(SIZE, b, a); + printf("Unsorted Array:\n"); + printArray(SIZE, a, true); + printf("Sorted Array:\n"); + printArray(SIZE, b, true); + sort(a, a + SIZE); + printCmpResult(SIZE, b, a); + + //zeroArray(SIZE, b); + //printDesc("cpu scan, power-of-two"); + //StreamCompaction::CPU::scan(SIZE, b, a); + //printArray(SIZE, b, true); + + //zeroArray(SIZE, c); + //printDesc("cpu scan, non-power-of-two"); + //StreamCompaction::CPU::scan(NPOT, c, a); + //printArray(NPOT, b, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("naive scan, power-of-two"); + //StreamCompaction::Naive::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("naive scan, non-power-of-two"); + //StreamCompaction::Naive::scan(NPOT, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, power-of-two"); + //StreamCompaction::Efficient::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, non-power-of-two"); + //StreamCompaction::Efficient::scan(NPOT, c, a); + ////printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + //zeroArray(SIZE, c); + //printDesc("thrust scan, power-of-two"); + //StreamCompaction::Thrust::scan(SIZE, c, a); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("thrust scan, non-power-of-two"); + //StreamCompaction::Thrust::scan(NPOT, c, a); + ////printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + //printf("\n"); + //printf("*****************************\n"); + //printf("** STREAM COMPACTION TESTS **\n"); + //printf("*****************************\n"); + + //// Compaction tests + + //genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + //a[SIZE - 1] = 0; + //printArray(SIZE, a, true); + + //int count, expectedCount, expectedNPOT; + + //zeroArray(SIZE, b); + //printDesc("cpu compact without scan, power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + //expectedCount = count; + //printArray(count, b, true); + //printCmpLenResult(count, expectedCount, b, b); + + //zeroArray(SIZE, c); + //printDesc("cpu compact without scan, non-power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + //expectedNPOT = count; //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + //printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + //zeroArray(SIZE, c); + //printDesc("cpu compact with scan"); + //count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + //printCmpLenResult(count, expectedCount, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient compact, power-of-two"); + //count = StreamCompaction::Efficient::compact(SIZE, c, a); + ////printArray(count, c, true); + //printCmpLenResult(count, expectedCount, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient compact, non-power-of-two"); + //count = StreamCompaction::Efficient::compact(NPOT, c, a); + ////printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); } system("pause"); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index f6b572f..adda3cb 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,7 +1,7 @@ #pragma once #include - +#include template int cmpArrays(int n, T *a, T *b) { for (int i = 0; i < n; i++) { @@ -40,7 +40,7 @@ void zeroArray(int n, int *a) { } void genArray(int n, int *a, int maxval) { - srand(0); + srand(time(NULL)); for (int i = 0; i < n; i++) { a[i] = rand() % maxval; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..be14bcb 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.cu" + "radix.h" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index d793ff2..c53b355 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -115,7 +115,7 @@ int compactWithScan(int n, int *odata, const int *idata) { auto end = std::chrono::system_clock::now(); //auto duration = std::chrono::duration_cast(end - start); std::chrono::duration duration = end - start; - std::cout << duration.count() << std::endl; + //std::cout << duration.count() << std::endl; //FILE* fp = fopen("efficient.txt", "a+"); //fprintf(fp, "%d %f\n", ilog2ceil(n), duration.count()); //fclose(fp); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 3e2eb31..0ee0002 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -18,7 +18,7 @@ int *dev_ScanResult; int *dev_OutputData; int *dev_total; -int threadPerBlock = 1024; +int threadPerBlock = 64; int BlockNum; //__global__ void CudaUpSweep(int d, int *data, int addTimes) //{ diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..214f52d --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,184 @@ +#include +#include +#include "common.h" +#include "naive.h" + +namespace StreamCompaction { +namespace Radix { + +int *dev_Data[2]; +int *dev_bArray; +int *dev_eArray; +int *dev_fArray; +int *dev_tArray; +int *dev_dArray; + +__global__ void CudaUpSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + data[(thid + 1) * (1 << (d + 1)) - 1] += data[(thid + 1) * (1 << (d + 1)) - 1 - (1 << d)]; +} + +__global__ void CudaDownSweep(int d, int *data, int addTimes) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= addTimes) + return; + int m = (thid + 1) * (1 << (d + 1)); + int temp = data[m - 1 - (1 << d)]; + data[m - 1 - (1 << d)] = data[m - 1]; + data[m - 1] += temp; +} + +__global__ void CudaGetBEArray(int pass, int *in, int *out1, int *out2, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out1[thid] = (in[thid] >> pass) & 1; + out2[thid] = out1[thid] ^ 1; +} + +__global__ void CudaGetTArray(int *in, int *e, int *out, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + int totalFalses = in[n - 1] + e[n - 1]; + if (thid >= n) + return; + out[thid] = thid - in[thid] + totalFalses; +} + +__global__ void CudaGetDArray(int *in1, int *in2, int *in3, int *out, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out[thid] = in1[thid] ? in2[thid] : in3[thid]; +} + +__global__ void CudaGetResult(int *in, int *out, int *data, int n) +{ + int thid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thid >= n) + return; + out[in[thid]] = data[thid]; +} + +void RadixSort(int n, int *odata, int *idata) +{ + //int n = 8; + //int idata[8] = {4,7,2,6,3,5,1,0}; + //int odata[8]; + + int threadPerBlock = 512; + int BlockNum; + int nCeilLog = ilog2ceil(n); + int nLength = 1 << nCeilLog; + int pout = 1; + cudaMalloc((void**)&dev_Data[0], n * sizeof(int)); + cudaMalloc((void**)&dev_Data[1], n * sizeof(int)); + cudaMalloc((void**)&dev_bArray, n * sizeof(int)); + cudaMalloc((void**)&dev_eArray, nLength * sizeof(int)); + cudaMalloc((void**)&dev_fArray, nLength * sizeof(int)); + cudaMalloc((void**)&dev_tArray, n * sizeof(int)); + cudaMalloc((void**)&dev_dArray, n * sizeof(int)); + cudaMemset(dev_eArray, 0, nLength * sizeof(int)); + cudaMemcpy(dev_Data[0], idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + checkCUDAError("cudaMalloc failed!"); + + int maxn = 0; + for (int i = 0; i < n; i++) + if (idata[i] > maxn) + maxn = idata[i]; +float time_elapsed = 0.0f; +cudaEvent_t start,stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +cudaEventRecord( start,0); + int pass = 0; + while (1) + { + int pin = pout ^ 1; + if ((maxn >> pass) == 0) + break; + BlockNum = n / threadPerBlock + 1; + CudaGetBEArray<<>>(pass, dev_Data[pin], dev_bArray, dev_eArray, n); + cudaMemcpy(dev_fArray, dev_eArray, nLength * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy device to device failed!"); + + for (int i = 0; i < nCeilLog; i++) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaUpSweep<<>>(i, dev_fArray, addTimes); + } + + cudaMemset(dev_fArray + nLength - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset failed!"); + for (int i = nCeilLog - 1; i >= 0; i--) + { + int addTimes = 1 << (nCeilLog - 1 - i); + BlockNum = addTimes / threadPerBlock + 1; + CudaDownSweep<<>>(i, dev_fArray, addTimes); + } + +//printf("F:"); +//cudaMemcpy(odata, dev_fArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + + BlockNum = n / threadPerBlock + 1; + CudaGetTArray<<>>(dev_fArray, dev_eArray, dev_tArray, n); + +//printf("T:"); +//cudaMemcpy(odata, dev_tArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + + CudaGetDArray<<>>(dev_bArray, dev_tArray, dev_fArray, dev_dArray, n); + CudaGetResult<<>>(dev_dArray, dev_Data[pout], dev_Data[pin], n); +//printf("D:"); +//cudaMemcpy(odata, dev_dArray, sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); +//printf("output:"); +//cudaMemcpy(odata, dev_Data[pout], sizeof(int) * n, cudaMemcpyDeviceToHost); +//checkCUDAError("cudaMemcpy to host failed!"); +//for (int i = 0; i < n; i++) +// printf("%d ", odata[i]); +//printf("\n"); + pass++; + pout ^= 1; + } +cudaEventRecord( stop,0); +cudaEventSynchronize(start); +cudaEventSynchronize(stop); +cudaEventElapsedTime(&time_elapsed,start,stop); +printf("Time:%f ms\n", time_elapsed); + cudaMemcpy(odata, dev_Data[pout ^ 1], sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy to host failed!"); + //for (int i = 0; i < n; i++) + // printf("%d ", odata[i]); + //printf("\n"); + + cudaFree(dev_Data[0]); + cudaFree(dev_Data[1]); + cudaFree(dev_bArray); + cudaFree(dev_eArray); + cudaFree(dev_fArray); + cudaFree(dev_tArray); + cudaFree(dev_dArray); +} + + +} +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..c9cf412 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { +namespace Radix { + void RadixSort(int n, int *odata, int *idata); +} +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index da69007..f16aa6f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -10,8 +10,6 @@ namespace StreamCompaction { namespace Thrust { -int *dev_Data; -int *dev_OutputData; /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -19,12 +17,18 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); -auto start = std::chrono::system_clock::now(); - thrust::exclusive_scan(idata, idata + n, odata); + + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(odata, odata + n); + auto start = std::chrono::system_clock::now(); + //thrust::exclusive_scan(idata, idata + n, odata); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); auto end = std::chrono::system_clock::now(); + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); + //auto duration = std::chrono::duration_cast(end - start); auto duration = std::chrono::duration_cast(end-start).count(); - std::cout << duration << std::endl; + //std::cout << duration << std::endl; FILE* fp = fopen("efficient.txt", "a+"); fprintf(fp, "%d %I64d\n", ilog2ceil(n), duration); fclose(fp); From 07a15a7116f8dc65484a9aed27d36ccb98ee56cd Mon Sep 17 00:00:00 2001 From: immiao Date: Tue, 27 Sep 2016 13:54:50 -0400 Subject: [PATCH 8/8] Update readme.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 49f1dac..b4b1320 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,8 @@ See Performance Analysis. See Performance Analysis. +As mentioned in GPU Gem 3, I guess thrust::exclusive_scan is using shared memory instread of global memory in my implementation. Accessing to shared memory in a block is more efficient than accessing to global memory. + - **Write a brief explanation of the phenomena you see here.** The problem of non-optimized efficient scan is that lots of threads are doing nothing because we're examining if the index is divisible in the thread. The solution is to eliminate these threads. Actually we're able to access the useful indices of data directly by deducing some formulas using the index of thread.