submit #29 (Open)

wants to merge 1 commit into base: main
23 changes: 17 additions & 6 deletions README.md
@@ -3,12 +3,23 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)
* Eyad Almoamen
* [LinkedIn](https://www.linkedin.com/in/eyadalmoamen/), [personal website](https://eyadnabeel.com)
* Tested on: Windows 11, i7-10750H @ 2.60GHz, 16GB RAM, RTX 2070 Super Max-Q Design 8GB (Personal Computer)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Introduction
======================
I implemented exclusive scan on the CPU and on the GPU using both the naive and work-efficient methods. I've also implemented stream compaction on both the CPU and the GPU.
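
For reference, an exclusive scan writes into each output slot the sum of every input element strictly before it, so the first output is always 0. A minimal CPU sketch of that definition (it mirrors the loop added in `stream_compaction/cpu.cu` below; the sample values are purely illustrative):

```cpp
#include <cstdio>

// Exclusive scan: out[i] = in[0] + in[1] + ... + in[i-1], with out[0] = 0.
void exclusiveScan(int n, int *out, const int *in) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        out[i] = sum;   // sum of everything before index i
        sum += in[i];
    }
}

int main() {
    const int in[8] = { 3, 1, 7, 0, 4, 1, 6, 3 };
    int out[8];
    exclusiveScan(8, out, in);
    for (int i = 0; i < 8; i++) {
        printf("%d ", out[i]);  // prints: 0 3 4 11 11 15 16 22
    }
    printf("\n");
    return 0;
}
```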

Analysis
======================
**Effect of Block Size on performance**
I ran the algorithms with varying block sizes on arrays of n = 2^14 elements; the graph below shows the results:

![](img/blocksize.png)

There doesn't seem to be any conclusive relationship between block size and performance.
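
For context on how the block size enters the picture: it is just the threads-per-block count in the launch configuration, with the grid rounded up so every element still gets one thread (the `#define blockSize 512` in `stream_compaction/common.h` is what was varied). A small stand-alone sketch of that launch pattern, using a toy kernel that is not part of this project:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

#define blockSize 512   // value from stream_compaction/common.h; this is what was varied

// Stand-in kernel with the same index/guard pattern as the kernels in this PR.
__global__ void kernTouch(int n, int *data) {
    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
    if (idx >= n) {
        return;  // the last block may be only partially full
    }
    data[idx] = idx;
}

int main() {
    const int n = 1 << 14;  // array size used for the block-size runs
    int *dev_data;
    cudaMalloc((void**)&dev_data, n * sizeof(int));

    // Round the grid up so all n elements are covered; changing blockSize only
    // regroups the same n threads into larger or smaller blocks.
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernTouch<<<fullBlocksPerGrid, blockSize>>>(n, dev_data);
    cudaDeviceSynchronize();

    printf("launched %u blocks of %d threads\n", fullBlocksPerGrid.x, blockSize);
    cudaFree(dev_data);
    return 0;
}
```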

**Effect of number of elements on performance**
(I ran into a bug that prevents the algorithm from running on arrays larger than 2^14, so I was not able to produce meaningful results here, especially in comparison with the CPU.)
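
One thing worth double-checking, purely a guess from reading `kernUpsweepStep`/`kernDownsweepStep` further down in this diff: `actualIdx` is computed as `(idx + 1) * destStride - 1` in 32-bit int, and the sweep loops run up to `d <= ilog2ceil(n)`, so for arrays past 2^14 the product at the deepest levels can exceed `INT_MAX` and wrap to a bogus index. A hedged sketch of a defensive variant (not the code in this PR) that does the index math in 64-bit and only launches the threads active at a given depth:

```cuda
#include <cuda_runtime.h>

// Hypothetical alternative to kernUpsweepStep: same algorithm, but the stride
// arithmetic is done in 64-bit so (idx + 1) * destStride cannot overflow.
__global__ void kernUpsweepStepSafe(int n, int destStride, int srcStride, int *data) {
    long long idx = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    long long actualIdx = (idx + 1) * (long long)destStride - 1;
    if (actualIdx >= n) {
        return;
    }
    data[actualIdx] += data[actualIdx - srcStride];
}

// Host-side sketch: at depth d only n / destStride tree nodes are updated, so the
// grid can be sized to that count instead of to the whole padded array.
//
//   int activeThreads = n / destStride;
//   dim3 grid((activeThreads + blockSize - 1) / blockSize);
//   kernUpsweepStepSafe<<<grid, blockSize>>>(n, destStride, srcStride, dev_data);
```
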
Binary file added img/blocksize.png
14 changes: 7 additions & 7 deletions src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 18; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,21 +64,21 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
20 changes: 20 additions & 0 deletions stream_compaction/common.cu
@@ -1,4 +1,5 @@
#include "common.h"
#include <device_launch_parameters.h>

void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
@@ -24,6 +25,17 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
if (idx >= n) {
return;
}

if (idata[idx] > 0) {
bools[idx] = 1;
}
else {
bools[idx] = 0;
}
}

/**
@@ -33,6 +45,14 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
if (idx >= n) {
return;
}

if (bools[idx] == 1) {
odata[indices[idx]] = idata[idx];
}
}

}
2 changes: 2 additions & 0 deletions stream_compaction/common.h
@@ -10,6 +10,8 @@
#include <chrono>
#include <stdexcept>

#define blockSize 512

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

46 changes: 44 additions & 2 deletions stream_compaction/cpu.cu
@@ -20,6 +20,13 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int sum = 0;
odata[0] = 0;
sum += idata[0];
for (int i = 1; i < n; i++) {
odata[i] = sum;
sum += idata[i];
}
timer().endCpuTimer();
}

@@ -31,8 +38,16 @@
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int count = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[count] = idata[i];
count++;
}
}

timer().endCpuTimer();
return -1;
return count;
}

/**
@@ -43,8 +58,35 @@
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int* bools = (int *)malloc(n * sizeof(int));
int* indices = (int*)malloc(n * sizeof(int));

for (int i = 0; i < n; i++) {
bools[i] = (idata[i] != 0) ? 1 : 0;
}
int sum = 0;
indices[0] = 0;
sum += bools[0];
for (int i = 1; i < n; i++) {
indices[i] = sum;
sum += bools[i];
}
memcpy(odata, indices, n * sizeof(int));
int count = indices[n - 1] + bools[n - 1]; // exclusive scan excludes the last element's flag, so add it back

for (int i = 0; i < n; i++) {
if (bools[i] == 1) {
odata[indices[i]] = idata[i];
}
}

free(bools);
free(indices);



timer().endCpuTimer();
return -1;
return count;
}
}
}
153 changes: 150 additions & 3 deletions stream_compaction/efficient.cu
@@ -2,6 +2,9 @@
#include <cuda_runtime.h>
#include "common.h"
#include "efficient.h"
#include <iostream>
#include <device_launch_parameters.h>


namespace StreamCompaction {
namespace Efficient {
@@ -15,10 +18,93 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/

__global__ void kernUpsweepStep(int n, int destStride, int srcStride, int *data) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int actualIdx = (idx + 1) * destStride - 1;
if (actualIdx >= n) {
return;
}
data[actualIdx] += data[actualIdx - srcStride];
}

__global__ void kernDownsweepStep(int n, int destStride, int srcStride, int* data) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int actualIdx = (idx + 1) * destStride - 1;
if (actualIdx >= n) {
return;
}
int temp = data[actualIdx - srcStride];
data[actualIdx - srcStride] = data[actualIdx];
data[actualIdx] += temp;
}

void scanWithoutTimer(int n, dim3 blocksPerGrid, int* dev_data) {
// TODO

for (int d = 0; d <= ilog2ceil(n); d++) {
kernUpsweepStep << <blocksPerGrid, blockSize >> > (n, std::pow(2, d + 1), std::pow(2, d), dev_data);
cudaDeviceSynchronize();
}

int zero = 0;
cudaMemcpy(dev_data + n - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);

for (int d = ilog2ceil(n); d >= 0; d--) {
kernDownsweepStep << <blocksPerGrid, blockSize >> > (n, std::pow(2, d + 1), std::pow(2, d), dev_data);
cudaDeviceSynchronize();
}
}

int closestPower(int num) {
int i = 0;
while (num > std::pow(2, i)) {
i++;
}
return std::pow(2, i);
}

int* zeros(int num) {
int *arr = (int*)malloc(num * sizeof(int));
for (int i = 0; i < num; i++) {
arr[i] = 0;
}
return arr;
}

void scan(int n, int *odata, const int *idata) {
int nPot = closestPower(n);

dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize);

int* dev_data;

cudaMalloc((void**)&dev_data, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_data");

cudaMemcpy(dev_data + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy idata ==> dev_data");

int* zero = zeros(n);

cudaMemcpy(dev_data, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy zero ==> dev_data");

cudaDeviceSynchronize();

timer().startGpuTimer();
// TODO

scanWithoutTimer(nPot, fullBlocksPerGrid, dev_data);

timer().endGpuTimer();

cudaMemcpy(odata, dev_data + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("Error during cudaMemcpy odata");

cudaFree(dev_data);
checkCUDAError("Error during cudaFree dev_data");

free(zero);
}

/**
@@ -31,10 +117,71 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int nPot = closestPower(n);

dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize);

int *dev_idata, *dev_bools, *dev_indices, *dev_odata;

cudaMalloc((void**)&dev_idata, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_idata");

cudaMalloc((void**)&dev_bools, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_bools");

cudaMalloc((void**)&dev_indices, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_indices");

cudaMalloc((void**)&dev_odata, nPot * sizeof(int));
checkCUDAError("Error during cudaMalloc dev_odata");

cudaMemcpy(dev_idata + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy dev_data");

int* zero = zeros(n);

cudaMemcpy(dev_idata, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("Error during cudaMemcpy zero ==> dev_data");

cudaDeviceSynchronize();

timer().startGpuTimer();
// TODO
//// TODO
//

StreamCompaction::Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (nPot, dev_bools, dev_idata);

cudaMemcpy(dev_indices, dev_bools, nPot * sizeof(int), cudaMemcpyDeviceToDevice);
checkCUDAError("Error during cudaMemcpy dev_data");

scanWithoutTimer(nPot, fullBlocksPerGrid, dev_indices);
StreamCompaction::Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata + nPot - n, dev_idata + nPot - n, dev_bools + nPot - n, dev_indices + nPot - n);

timer().endGpuTimer();
return -1;

cudaMemcpy(odata, dev_odata + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("Error during cudaMemcpy dev_odata");

int count = 0;
int lastbool = 0;
cudaMemcpy(&count, dev_indices + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&lastbool, dev_bools + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost);

count += lastbool;

cudaFree(dev_odata);
checkCUDAError("Error during cudaFree dev_odata");

cudaFree(dev_indices);
checkCUDAError("Error during cudaFree dev_indices");

cudaFree(dev_bools);
checkCUDAError("Error during cudaFree dev_bools");

cudaFree(dev_idata);
checkCUDAError("Error during cudaFree dev_idata");

return count;
}
}
}