submit #29 (Open)

wants to merge 1 commit into base: main
23 changes: 17 additions & 6 deletions README.md
@@ -3,12 +3,23 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)
* Eyad Almoamen
* [LinkedIn](https://www.linkedin.com/in/eyadalmoamen/), [personal website](https://eyadnabeel.com)
* Tested on: Windows 11, i7-10750H @ 2.60GHz, 16GB RAM, RTX 2070 Super Max-Q Design 8GB (Personal Computer)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Introduction
======================
I implemented exclusive scan on the CPU and on the GPU using both the naive and work-efficient methods. I've also implemented stream compaction on both the CPU and the GPU.
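
For reference, an exclusive scan writes into each output slot the sum of every input element strictly before it, so the first output is always 0. A minimal CPU sketch of that definition (it mirrors the loop added in `stream_compaction/cpu.cu` below; the sample values are purely illustrative):

```cpp
#include <cstdio>

// Exclusive scan: out[i] = in[0] + in[1] + ... + in[i-1], with out[0] = 0.
void exclusiveScan(int n, int *out, const int *in) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        out[i] = sum;   // sum of everything before index i
        sum += in[i];
    }
}

int main() {
    const int in[8] = { 3, 1, 7, 0, 4, 1, 6, 3 };
    int out[8];
    exclusiveScan(8, out, in);
    for (int i = 0; i < 8; i++) {
        printf("%d ", out[i]);  // prints: 0 3 4 11 11 15 16 22
    }
    printf("\n");
    return 0;
}
```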

Analysis
======================
**Effect of Block Size on performance**
I ran the algorithms with varying block sizes on arrays of n = 2^14 elements; the graph below shows the results:

![](img/blocksize.png)

There doesn't seem to be any conclusive relationship between block size and performance.
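
For context on how the block size enters the picture: it is just the threads-per-block count in the launch configuration, with the grid rounded up so every element still gets one thread (the `#define blockSize 512` in `stream_compaction/common.h` is what was varied). A small stand-alone sketch of that launch pattern, using a toy kernel that is not part of this project:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

#define blockSize 512   // value from stream_compaction/common.h; this is what was varied

// Stand-in kernel with the same index/guard pattern as the kernels in this PR.
__global__ void kernTouch(int n, int *data) {
    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
    if (idx >= n) {
        return;  // the last block may be only partially full
    }
    data[idx] = idx;
}

int main() {
    const int n = 1 << 14;  // array size used for the block-size runs
    int *dev_data;
    cudaMalloc((void**)&dev_data, n * sizeof(int));

    // Round the grid up so all n elements are covered; changing blockSize only
    // regroups the same n threads into larger or smaller blocks.
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernTouch<<<fullBlocksPerGrid, blockSize>>>(n, dev_data);
    cudaDeviceSynchronize();

    printf("launched %u blocks of %d threads\n", fullBlocksPerGrid.x, blockSize);
    cudaFree(dev_data);
    return 0;
}
```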

**Effect of number of elements on performance**
(I ran into a bug that prevents the algorithm from running on arrays larger than 2^14, so I was not able to produce meaningful results here, especially in comparison with the CPU.)
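
One thing worth double-checking, purely a guess from reading `kernUpsweepStep`/`kernDownsweepStep` further down in this diff: `actualIdx` is computed as `(idx + 1) * destStride - 1` in 32-bit int, and the sweep loops run up to `d <= ilog2ceil(n)`, so for arrays past 2^14 the product at the deepest levels can exceed `INT_MAX` and wrap to a bogus index. A hedged sketch of a defensive variant (not the code in this PR) that does the index math in 64-bit and only launches the threads active at a given depth:

```cuda
#include <cuda_runtime.h>

// Hypothetical alternative to kernUpsweepStep: same algorithm, but the stride
// arithmetic is done in 64-bit so (idx + 1) * destStride cannot overflow.
__global__ void kernUpsweepStepSafe(int n, int destStride, int srcStride, int *data) {
    long long idx = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    long long actualIdx = (idx + 1) * (long long)destStride - 1;
    if (actualIdx >= n) {
        return;
    }
    data[actualIdx] += data[actualIdx - srcStride];
}

// Host-side sketch: at depth d only n / destStride tree nodes are updated, so the
// grid can be sized to that count instead of to the whole padded array.
//
//   int activeThreads = n / destStride;
//   dim3 grid((activeThreads + blockSize - 1) / blockSize);
//   kernUpsweepStepSafe<<<grid, blockSize>>>(n, destStride, srcStride, dev_data);
```
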
Binary file added img/blocksize.png
14 changes: 7 additions & 7 deletions src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 18; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,21 +64,21 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
20 changes: 20 additions & 0 deletions stream_compaction/common.cu
@@ -1,4 +1,5 @@
#include "common.h"
#include <device_launch_parameters.h>

void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
@@ -24,6 +25,17 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
if (idx >= n) {
return;
}

if (idata[idx] > 0) {
bools[idx] = 1;
}
else {
bools[idx] = 0;
}
}

/**
@@ -33,6 +45,14 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
if (idx >= n) {
return;
}

if (bools[idx] == 1) {
odata[indices[idx]] = idata[idx];
}
}

}
2 changes: 2 additions & 0 deletions stream_compaction/common.h
@@ -10,6 +10,8 @@
#include <chrono>
#include <stdexcept>

#define blockSize 512

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

46 changes: 44 additions & 2 deletions stream_compaction/cpu.cu
@@ -20,6 +20,13 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int sum = 0;
odata[0] = 0;
sum += idata[0];
for (int i = 1; i < n; i++) {
odata[i] = sum;
sum += idata[i];
}
timer().endCpuTimer();
}

@@ -31,8 +38,16 @@
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int count = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[count] = idata[i];
count++;
}
}

timer().endCpuTimer();
return -1;
return count;
}

/**
@@ -43,8 +58,35 @@
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int* bools = (int *)malloc(n * sizeof(int));
int* indices = (int*)malloc(n * sizeof(int));

for (int i = 0; i < n; i++) {
bools[i] = (idata[i] != 0) ? 1 : 0;
}
int sum = 0;
indices[0] = 0;
sum += bools[0];
for (int i = 1; i < n; i++) {
indices[i] = sum;
sum += bools[i];
}
memcpy(odata, indices, n * sizeof(int));
int count = indices[n - 1] + bools[n - 1]; // exclusive scan excludes the last element's flag, so add it back

for (int i = 0; i < n; i++) {
if (bools[i] == 1) {
odata[indices[i]] = idata[i];
}
}

free(bools);
free(indices);



timer().endCpuTimer();
return -1;
return count;
}
}
}
153 changes: 150 additions & 3 deletions stream_compaction/efficient.cu
@@ -2,6 +2,9 @@
#include <cuda_runtime.h>
#include "common.h"
#include "efficient.h"
#include <iostream>
#include <device_launch_parameters.h>


namespace StreamCompaction {
namespace Efficient {
@@ -15,10 +18,93 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/

__global__ void kernUpsweepStep(int n, int destStride, int srcStride, int *data) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int actualIdx = (idx + 1) * destStride - 1;
if (actualIdx >= n) {
return;
}
data[actualIdx] += data[actualIdx - srcStride];
}

__global__ void kernDownsweepStep(int n, int destStride, int srcStride, int* data) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int actualIdx = (idx + 1) * destStride - 1;
if (actualIdx >= n) {
return;
}
int temp = data[actualIdx - srcStride];
data[actualIdx - srcStride] = data[actualIdx];
data[actualIdx] += temp;
}

void scanWithoutTimer(int n, dim3 blocksPerGrid, int* dev_data) {
// TODO

for (int d = 0; d <= ilog2ceil(n); d++) {
kernUpsweepStep << <blocksPerGrid, blockSize >> > (n, std::pow(2, d + 1), std::pow(2, d), dev_data);
cudaDeviceSynchronize();
}

int zero = 0;
cudaMemcpy(dev_data + n - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);

for (int d = ilog2ceil(n); d >= 0; d--) {
kernDownsweepStep << <blocksPerGrid, blockSize >> > (n, std::pow(2, d + 1), std::pow(2, d), dev_data);
cudaDeviceSynchronize();
}
}

int closestPower(int num) {
int i = 0;
while (num > std::pow(2, i)) {
i++;
}
return std::pow(2, i);
}

int* zeros(int num) {
int *arr = (int*)malloc(num * sizeof(int));
for (int i = 0; i < num; i++) {
arr[i] = 0;
}
return arr;
}

void scan(int n, int *odata, const int *idata) {
int nPot = closestPower(n);

dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize);

int* dev_data;

cudaMalloc((void**)&dev_data, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_data");

cudaMemcpy(dev_data + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy idata ==> dev_data");

int* zero = zeros(n);

cudaMemcpy(dev_data, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy zero ==> dev_data");

cudaDeviceSynchronize();

timer().startGpuTimer();
// TODO

scanWithoutTimer(nPot, fullBlocksPerGrid, dev_data);

timer().endGpuTimer();

cudaMemcpy(odata, dev_data + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("Error during cudaMemcpy odata");

cudaFree(dev_data);
checkCUDAError("Error during cudaFree dev_data");

free(zero);
}

/**
@@ -31,10 +117,71 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int nPot = closestPower(n);

dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize);

int *dev_idata, *dev_bools, *dev_indices, *dev_odata;

cudaMalloc((void**)&dev_idata, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_idata");

cudaMalloc((void**)&dev_bools, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_bools");

cudaMalloc((void**)&dev_indices, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_indices");

cudaMalloc((void**)&dev_odata, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_odata");

cudaMemcpy(dev_idata + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy dev_data");

int* zero = zeros(n);

cudaMemcpy(dev_idata, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy zero ==> dev_data");

cudaDeviceSynchronize();

timer().startGpuTimer();
// TODO
//// TODO
//

StreamCompaction::Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (nPot, dev_bools, dev_idata);

cudaMemcpy(dev_indices, dev_bools, nPot * sizeof(int), cudaMemcpyDeviceToDevice);
checkCUDAError("Error during cudaMemcpy dev_data");

scanWithoutTimer(nPot, fullBlocksPerGrid, dev_indices);
StreamCompaction::Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata + nPot - n, dev_idata + nPot - n, dev_bools + nPot - n, dev_indices + nPot - n);

timer().endGpuTimer();
return -1;

cudaMemcpy(odata, dev_odata + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("Error during cudaMemcpy dev_odata");

int count = 0;
int lastbool = 0;
cudaMemcpy(&count, dev_indices + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&lastbool, dev_bools + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost);

count += lastbool;

cudaFree(dev_odata);
checkCUDAError("Error during cudaFree dev_odata");

cudaFree(dev_indices);
checkCUDAError("Error during cudaFree dev_indices");

cudaFree(dev_bools);
checkCUDAError("Error during cudaFree dev_bools");

cudaFree(dev_idata);
checkCUDAError("Error during cudaFree dev_idata");

return count;
}
}
}