Submission #2 (Open): wants to merge 1 commit into master
50 changes: 47 additions & 3 deletions Part1/src/kernel.cu
@@ -83,25 +83,59 @@ __global__ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::
}
}

__device__ glm::vec3 calculateSingleAcceleration (glm::vec4 me, glm::vec4 other){
    glm::vec3 outAcceleration (0, 0, 0);

    glm::vec4 distance4 = other - me;
    glm::vec3 distance (distance4.x, distance4.y, distance4.z);
    float length = glm::length (distance);

    // Skip bodies that are too close (including myself, at distance 0)
    // to avoid dividing by a near-zero distance.
    if (length > 0.1f){
        // a = G * m_other / r^2, pointing from me toward other (other.w holds the mass).
        outAcceleration = (float(G) * other.w / (length * length)) * (distance / length);
    }

    return outAcceleration;
}

// TODO: Core force calc kernel global memory
// HINT : You may want to write a helper function that will help you
// calculate the acceleration contribution of a single body.
// REMEMBER : F = (G * m_a * m_b) / (r_ab ^ 2)
__device__ glm::vec3 accelerate(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
{
    // Since F = (G * m_a * m_b) / r^2 and a = F / m_a, my own mass divides out.
    // Start with the central star (mass starMass at the origin), then add
    // the contribution of every body.
    glm::vec3 outAcc = calculateSingleAcceleration (my_pos, glm::vec4(0, 0, 0, starMass));
    for (int i = 0; i < N; i += 1){
        outAcc += calculateSingleAcceleration (my_pos, their_pos[i]);
    }
    return outAcc;
}

// TODO : update the acceleration of each body
__global__ void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
{
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    if (index < N){
        glm::vec4 myPos = pos[index];
        acc[index] = accelerate(N, myPos, pos);
    }
}

// TODO : update velocity and position using a simple Euler integration scheme
__global__ void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
{
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;

    if (id < N){
        // Forward Euler: v += a*dt, then x += v*dt.
        vel[id] += acc[id] * dt;
        pos[id].x += vel[id].x * dt;
        pos[id].y += vel[id].y * dt;
        pos[id].z += vel[id].z * dt;
    }
}

// Update the vertex buffer object
@@ -180,6 +214,16 @@ void initCuda(int N)
void cudaNBodyUpdateWrapper(float dt)
{
    dim3 fullBlocksPerGrid((int)ceil(float(numObjects) / float(blockSize)));

    updateF<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
    checkCUDAErrorWithLine("updateF kernel failed!");

    updateS<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
    checkCUDAErrorWithLine("updateS kernel failed!");

    // cudaThreadSynchronize() is deprecated in newer CUDA releases;
    // cudaDeviceSynchronize() is the modern equivalent.
    cudaThreadSynchronize();
}

void cudaUpdateVBO(float * vbodptr, int width, int height)
144 changes: 144 additions & 0 deletions Part1/src/main_kernel.cu
@@ -0,0 +1,144 @@
#include <stdio.h>
#include <cuda.h>
#include <cmath>
#include "glm/glm.hpp"
#include <iostream>

using namespace std;

//Initialize memory, update some globals
void initCuda(int N)
{
    cudaThreadSynchronize();
}

__global__ void mat_add(int n, float * A, float * B, float * out){
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    if (index < n){
        out[index] = A[index] + B[index];
    }
}

__global__ void mat_sub(int n, float * A, float * B, float * out){
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    if (index < n){
        out[index] = A[index] - B[index];
    }
}

__global__ void mat_mult(int n, float * A, float * B, float * out){
    int row = (blockIdx.y * blockDim.y) + threadIdx.y;
    int col = (blockIdx.x * blockDim.x) + threadIdx.x;

    int singleDim = (int)sqrt(float(n)); // n is assumed to be a perfect square (square matrices)

    if (row < singleDim && col < singleDim){
        float outVal = 0;
        for (int i = 0; i < singleDim; i += 1){
            outVal += A[row * singleDim + i] * B[i * singleDim + col];
        }
        out[row * singleDim + col] = outVal;
    }
}

void mat_add_serial(int n, float * A, float * B, float * out){
    for (int i = 0; i < n; i += 1){
        out[i] = A[i] + B[i];
    }
}

void mat_sub_serial(int n, float * A, float * B, float * out){
    for (int i = 0; i < n; i += 1){
        out[i] = A[i] - B[i];
    }
}

void mat_mult_serial(int n, float * A, float * B, float * out){
    int singleDim = (int)sqrt(float(n));
    for (int row = 0; row < singleDim; row += 1){
        for (int col = 0; col < singleDim; col += 1){
            float outVal = 0;
            for (int i = 0; i < singleDim; i += 1){
                outVal += A[row * singleDim + i] * B[i * singleDim + col];
            }
            out[row * singleDim + col] = outVal;
        }
    }
}

void printMat (float* toPrint, int dim){
    int index = 0;
    for (int i = 0; i < dim; i += 1){
        for (int j = 0; j < dim; j += 1){
            cout << toPrint[index] << ",";
            index += 1;
        }
        cout << endl;
    }
    cout << endl;
}

int main(){

    float * myCPUArray1 = new float[25];
    float * myCPUArray2 = new float[25];
    float * outCPUArray = new float[25];

    // Each entry equals its position: A_00 = 0, A_01 = 1, ..., A_44 = 24.
    for (int i = 0; i < 25; i += 1){
        myCPUArray1[i] = i;
        myCPUArray2[i] = i;
        outCPUArray[i] = i;
    }

    printMat (myCPUArray1, 5);
    printMat (myCPUArray2, 5);

    float * myGPUArray1;
    float * myGPUArray2;

    cudaMalloc ((void**)&myGPUArray1, 25 * sizeof(float));
    cudaMemcpy (myGPUArray1, myCPUArray1, 25 * sizeof(float), cudaMemcpyHostToDevice);

    cudaMalloc ((void**)&myGPUArray2, 25 * sizeof(float));
    cudaMemcpy (myGPUArray2, myCPUArray2, 25 * sizeof(float), cudaMemcpyHostToDevice);

    // The output buffer only needs to be allocated; the kernels overwrite it.
    float * outGPUArray;
    cudaMalloc ((void**)&outGPUArray, 25 * sizeof(float));

    int tileSize = 8;
    dim3 threadsPerBlock(tileSize, tileSize);
    // NOTE: with 8 x 8 blocks, threads in a block that differ only in threadIdx.y
    // compute the same 1D index in mat_add/mat_sub and redundantly write the same
    // value; a 1D block would avoid the duplicate work.
    dim3 fullBlocksPerGridSingle(25);
    dim3 fullBlocksPerGridDouble(5, 5); // 5 x 5 blocks of 8 x 8 threads covers the 5 x 5 matrix

    mat_add<<<fullBlocksPerGridSingle, threadsPerBlock>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
    cudaThreadSynchronize();
    cudaMemcpy (outCPUArray, outGPUArray, 25 * sizeof(float), cudaMemcpyDeviceToHost);

    printMat (outCPUArray, 5);

    mat_sub<<<fullBlocksPerGridSingle, threadsPerBlock>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
    cudaThreadSynchronize();
    cudaMemcpy (outCPUArray, outGPUArray, 25 * sizeof(float), cudaMemcpyDeviceToHost);

    printMat (outCPUArray, 5);

    mat_mult<<<fullBlocksPerGridDouble, threadsPerBlock>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
    cudaThreadSynchronize();
    cudaMemcpy (outCPUArray, outGPUArray, 25 * sizeof(float), cudaMemcpyDeviceToHost);

    printMat (outCPUArray, 5);

    delete [] myCPUArray1;
    delete [] myCPUArray2;
    delete [] outCPUArray;
    cudaFree (myGPUArray1);
    cudaFree (myGPUArray2);
    cudaFree (outGPUArray);

    std::cin.ignore ();
    return 0;
}
126 changes: 12 additions & 114 deletions README.md
@@ -1,120 +1,18 @@
# Project 1 : Introduction to CUDA

## NOTE :
This project (and all other projects in this course) requires an NVIDIA graphics
card with CUDA capability! Any card with compute capability 2.0 and up will
work, which means any card from the GeForce 400 series onward. If you do not
have a machine with these specs, feel free to use the computers in the SIG Lab.
All computers in the SIG Lab and Moore 100 C have CUDA capable cards and should
already have the CUDA SDK installed.

## PART 1 : INSTALL NSIGHT
To help with benchmarking and performance analysis, we will be using NVIDIA's
profiling and debugging tool named NSight. Download and install it from the
following link for whichever IDE you will be using:
http://www.nvidia.com/object/nsight.html.

NOTE : If you are using Linux / Mac, most of the screenshots and in-class usage
of NSight will be in Visual Studio. You are free to use the Eclipse version of
NSight during the in-class labs, but we will not be able to help you as much.

## PART 2 : NBODY SIMULATION
To get you used to writing CUDA kernels, we will be building a simple 2D N-body
simulator. The following source files are included in the project:

* main.cpp : sets up graphics stuff for visualization
* kernel.cu : this contains the CUDA kernel calls

All the code that you will need to modify is in kernel.cu and is marked clearly
by TODOs.

## PART 3 : MATRIX MATH
In this portion we will walk you through setting up a project that implements
some simple matrix math functions. Please put this portion in a folder marked Part2
in your repository.

### Step 1 : Create your project.
Using the instructions on the Google forum, please set up a new Visual Studio project that
compiles using CUDA. For uniformity, please write your main function and all
your code in a file named matrix_math.cu.

### Step 2 : Setting up CUDA memory.
As we discussed in class, there is host memory and device memory. Host memory
is the memory that exists on the CPU, whereas device memory is memory on the
GPU.

In order to create/reserve memory on the GPU, we need to do so explicitly using
cudaMalloc. Calling cudaMalloc is like calling malloc on the GPU: it reserves a
portion of device memory and returns a pointer to it. This memory is only
accessible on the device unless we explicitly copy it back to the CPU; likewise,
host memory is not accessible on the device unless we copy it over.

We can copy memory to and from the GPU using the function cudaMemcpy. Like the
POSIX C memcpy, you will need to specify the size of the memory you are copying.
The last argument specifies the direction of the copy (from GPU to CPU or the
other way around).
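
For example, here is a minimal sketch of the full round trip (h_A and dev_A are
illustrative names, not part of the starter code):

    // Allocate 25 floats on the device, copy host data over, and copy it back.
    float h_A[25] = {0};   // host data (initialized to zero here)
    float * dev_A;
    cudaMalloc((void**)&dev_A, 25 * sizeof(float));
    cudaMemcpy(dev_A, h_A, 25 * sizeof(float), cudaMemcpyHostToDevice); // CPU -> GPU
    // ... launch kernels that read/write dev_A here ...
    cudaMemcpy(h_A, dev_A, 25 * sizeof(float), cudaMemcpyDeviceToHost); // GPU -> CPU
    cudaFree(dev_A);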

Please initialize two 5 x 5 matrices, represented as arrays of floats, on the
CPU and the GPU, where each entry is equal to its position (i.e. A_00 = 0,
A_01 = 1, ..., A_44 = 24).

### Step 3 : Creating CUDA kernels.
In the previous part, we explicitly divided the CUDA kernels from the rest of
the file for stylistic purposes. Since there will be far less code in this
project, we will write the global and device functions in the same file as the
main function.

Given a matrix A and matrix B (both represented as arrays of floats), please
write the following functions :
* mat_add : A + B
* mat_sub : A - B
* mat_mult : A * B

You may assume for all matrices that the dimensions of A and B are the same and
that they are square.
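
Since the matrices are stored as flat arrays, it helps to fix the indexing
convention first: in row-major order, entry (row, col) of a d x d matrix lives
at index row * d + col. One output entry of the multiply then reduces to the
following sketch (d, row, and col are assumed to be in scope; this shows the
indexing idea, not prescribed code):

    // One entry of C = A * B for d x d row-major matrices.
    float sum = 0;
    for (int i = 0; i < d; i += 1){
        sum += A[row * d + i] * B[i * d + col];
    }
    C[row * d + col] = sum;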

Use the two 5 x 5 matrices to test your code, either by printing directly to
the console or by writing asserts.

THINGS TO REMEMBER :
* global and device functions only have access to memory that is explicitly on
the device, meaning you MUST copy memory from the CPU to the GPU if you would
like to use it there
* The triple angle brackets "<<<" and ">>>" delimit a kernel (global function)
launch. The parameters between them tell CUDA the grid and block dimensions to
launch with, i.e. how many blocks and how many threads per block to use; see
the sketch after this list.
* Do not fret if IntelliSense does not understand CUDA keywords (i.e. if it
draws red squiggly lines underneath them). There is a way to integrate CUDA
syntax highlighting into Visual Studio, but it is not the default.
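
As a concrete illustration, a launch of the mat_add kernel from this assignment
might look like the following sketch (dev_A, dev_B, and dev_out are
illustrative device pointers, assumed already allocated and filled):

    int n = 25;
    int threadsPerBlock = 32; // a multiple of the 32-thread warp size
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // round up
    mat_add<<<blocksPerGrid, threadsPerBlock>>>(n, dev_A, dev_B, dev_out);
    cudaDeviceSynchronize(); // wait for the kernel to finish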

### Step 4 : Write a serial version.
For comparison, write a single-threaded CPU version of mat_add, mat_sub and
mat_mult. We will not introduce timing elements in this project, but please
keep them in mind, as the upcoming lab will cover this topic in more depth.

## PART 4 : PERFORMANCE ANALYSIS
Since this is the first project, we will guide you with some example
questions. In future projects, please answer at least these questions, as
they go through basic performance analysis. Please go above and beyond the
questions we suggest and explore how different aspects of your code impact
performance as a whole.

We have provided a frame counter as a metric, but feel free to add CUDA event
timers, etc. to do more fine-grained benchmarking of various aspects.

NOTE : Performance should be measured in comparison to a baseline. Be sure to
describe the changes you make between experiments and how you are benchmarking.
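
If you want finer-grained numbers than the frame counter provides, one option
is CUDA events; the sketch below times a single kernel launch (the launch
itself is a placeholder):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    // kernel<<<blocks, threads>>>(...); // the launch you want to measure
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // block until the stop event completes

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);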

* How does changing the tile and block sizes change performance? Why?
  - This is a hard question to answer. These values depend on both the specific
    architecture of the card and on how the problem is being distributed. One
    rule of thumb is that block size should generally be a multiple of 32,
    since the card will round up to a multiple of that anyway. Beyond that,
    experimentation is often required to find the best configuration. You do
    need enough warps in flight to hide latency, though, so up to a point,
    increasing these sizes will improve performance.
* How does changing the number of planets change performance? Why?
  - The fewer the planets, the faster the code runs. Each body's acceleration
    is computed by summing over all N bodies, so the work grows quadratically
    with N. We are not exactly optimizing the simulation either (there is a
    lot of global memory access, for example), so each additional body adds
    compute time.
* Without running experiments, how would you expect the serial and GPU versions
  of matrix_math to compare? Why?
  - The serial versions should be much slower than the GPU versions, and the
    difference should become much more pronounced as the matrix dimensions grow.

## SUBMISSION
Please commit your changes to your forked version of the repository and open a
pull request. Please write your performance analysis in your README.md.
Remember to email Harmony ([email protected]) your grade and why.