From 7c7f3154a15848b36a30ccdd513ebd7506bbef5f Mon Sep 17 00:00:00 2001
From: Rohith Chandran
Date: Sat, 20 Sep 2014 14:30:04 -0400
Subject: [PATCH] Submission hopefully

---
 Part1/src/kernel.cu      |  50 +++++++++++++-
 Part1/src/main_kernel.cu | 144 +++++++++++++++++++++++++++++++++++++++
 README.md                | 126 ++++------------------------
 3 files changed, 203 insertions(+), 117 deletions(-)
 create mode 100644 Part1/src/main_kernel.cu

diff --git a/Part1/src/kernel.cu b/Part1/src/kernel.cu
index ee9b9e5..e02addf 100644
--- a/Part1/src/kernel.cu
+++ b/Part1/src/kernel.cu
@@ -83,25 +83,59 @@ __global__ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::vec4 * pos)
 	}
 }
 
+//Gravitational acceleration that body "other" (w component = mass) exerts on body "me".
+__device__ glm::vec3 calculateSingleAcceleration (glm::vec4 me, glm::vec4 other){
+	glm::vec3 outAcceleration (0,0,0);
+
+	glm::vec4 distance4 = other-me;
+	glm::vec3 distance (distance4.x, distance4.y, distance4.z);
+	float length = glm::length (distance);
+
+	//Ignore bodies closer than 0.1 units so the 1/r^2 term cannot blow up.
+	if (length > 0.1f){
+		outAcceleration = (float(G) * other.w / (length*length)) * (distance/length);
+	}
+
+	return outAcceleration;
+}
+
 // TODO: Core force calc kernel global memory
 // HINT : You may want to write a helper function that will help you
 // calculate the acceleration contribution of a single body.
 // REMEMBER : F = (G * m_a * m_b) / (r_ab ^ 2)
 __device__ glm::vec3 accelerate(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
 {
-	return glm::vec3(0.0f);
+	//Start with the pull of the central star, then accumulate the
+	//contribution of every other body.
+	glm::vec3 outAcc = calculateSingleAcceleration (my_pos, glm::vec4(0,0,0,starMass));
+	for (int i=0; i<N; i++){
+		outAcc += calculateSingleAcceleration (my_pos, their_pos[i]);
+	}
+	return outAcc;
 }
 
+//Update the acceleration of each body by brute-force summation over all bodies.
 __global__ void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
 {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N){
+		acc[index] = accelerate (N, pos[index], pos);
+	}
 }
 
+//Update velocity and position with simple Euler integration.
 __global__ void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
 {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N){
+		vel[index] += acc[index] * dt;
+		pos[index].x += vel[index].x * dt;
+		pos[index].y += vel[index].y * dt;
+		pos[index].z += vel[index].z * dt;
+	}
 }
 
 void cudaNBodyUpdateWrapper(float dt)
 {
+	dim3 fullBlocksPerGrid ((int)ceil(float(numObjects)/float(blockSize)));
+
+	updateF<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
+	checkCUDAErrorWithLine("Kernel failed!");
+
+	updateS<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
+	checkCUDAErrorWithLine("Kernel failed!");
+
+	cudaThreadSynchronize ();
+
 }
 
 void cudaUpdateVBO(float * vbodptr, int width, int height)
diff --git a/Part1/src/main_kernel.cu b/Part1/src/main_kernel.cu
new file mode 100644
index 0000000..be911f4
--- /dev/null
+++ b/Part1/src/main_kernel.cu
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include "glm/glm.hpp"
+#include <iostream>
+
+using namespace std;
+
+//Initialize memory, update some globals
+void initCuda(int N)
+{
+	cudaThreadSynchronize();
+}
+
+//Elementwise add: one thread per entry.
+__global__ void mat_add(int n, float * A, float * B, float * out){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		out[index] = A[index] + B[index];
+	}
+}
+
+//Elementwise subtract: one thread per entry.
+__global__ void mat_sub(int n, float * A, float * B, float * out){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		out[index] = A[index] - B[index];
+	}
+}
+
+//Matrix multiply: thread (row, col) computes the dot product of row "row"
+//of A with column "col" of B.
+__global__ void mat_mult(int n, float * A, float * B, float * out){
+	int row = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int col = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	int singleDim = sqrt(float(n)); //n is the element count of a square matrix
+
+	if (row < singleDim && col < singleDim){
+		float outVal = 0;
+		for (int i=0; i<singleDim; i++){
+			outVal += A[(row*singleDim) + i] * B[(i*singleDim) + col];
+		}
+		out[(row*singleDim) + col] = outVal;
+	}
+}
+
+//Print a dim x dim row-major matrix to the console.
+void printMat (float * mat, int dim){
+	for (int row=0; row<dim; row++){
+		for (int col=0; col<dim; col++){
+			printf ("%6.1f ", mat[(row*dim) + col]);
+		}
+		printf ("\n");
+	}
+	printf ("\n");
+}
+
+int main (int argc, char** argv){
+	//Two 5 x 5 matrices where each entry equals its position:
+	//A_00 = 0, A_01 = 1, ..., A_44 = 24.
+	float * myCPUArray1 = new float [25];
+	float * myCPUArray2 = new float [25];
+	float outCPUArray [25];
+
+	for (int i=0; i<25; i++){
+		myCPUArray1[i] = float(i);
+		myCPUArray2[i] = float(i);
+	}
+
+	float * myGPUArray1;
+	float * myGPUArray2;
+	float * outGPUArray;
+
+	cudaMalloc ((void**)&myGPUArray1, 25*sizeof(float));
+	cudaMalloc ((void**)&myGPUArray2, 25*sizeof(float));
+	cudaMalloc ((void**)&outGPUArray, 25*sizeof(float));
+
+	cudaMemcpy( myGPUArray1, myCPUArray1, 25*sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy( myGPUArray2, myCPUArray2, 25*sizeof(float), cudaMemcpyHostToDevice);
+
+	//One block of 25 threads for the elementwise kernels,
+	//one 5 x 5 block for the multiply.
+	dim3 threadsPerBlock (5, 5);
+
+	mat_add<<<1, 25>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	mat_sub<<<1, 25>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	cudaMemcpy( outGPUArray, myCPUArray2, 25*sizeof(float), cudaMemcpyHostToDevice);
+	mat_mult<<<1, threadsPerBlock>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	delete [] myCPUArray1;
+	delete [] myCPUArray2;
+
+	cudaFree (myGPUArray1);
+	cudaFree (myGPUArray2);
+	cudaFree (outGPUArray);
+
+	std::cin.ignore (); //keep the console window open until a key is pressed
+	return 0;
+}
diff --git a/README.md b/README.md
index 70ae0d3..17f0865 100644
--- a/README.md
+++ b/README.md
@@ -1,120 +1,18 @@
-Project 1
-=========
-# Project 1 : Introduction to CUDA
-
-## NOTE :
-This project (and all other projects in this course) requires an NVIDIA
-graphics card with CUDA capability! Any card with compute capability 2.0 and
-up will work. This means any card from the GeForce 400 and 500 series onwards
-will work. If you do not have a machine with these specs, feel free to use
-computers in the SIG Lab. All computers in SIG Lab and Moore 100C have CUDA
-capable cards and should already have the CUDA SDK installed.
-
-## PART 1 : INSTALL NSIGHT
-To help with benchmarking and performance analysis, we will be using NVIDIA's
-profiling and debugging tool, NSight. Download and install it from the
-following link for whichever IDE you will be using:
-http://www.nvidia.com/object/nsight.html.
-
-NOTE : If you are using Linux / Mac, most of the screenshots and in-class
-usage of NSight will be in Visual Studio. You are free to use the Eclipse
-version of NSight during the in-class labs, but we will not be able to help
-you as much.
-
-## PART 2 : NBODY SIMULATION
-To get you used to writing CUDA kernels, we will build a simple 2D n-body
-simulator. The following source files are included in the project:
-
-* main.cpp : sets up the graphics code for visualization
-* kernel.cu : contains the CUDA kernels and the calls into them
-
-All the code that you will need to modify is in kernel.cu and is marked
-clearly by TODOs.
-
-## PART 3 : MATRIX MATH
-In this portion we will walk you through setting up a project that implements
-some simple matrix math functions. Please put this portion in a folder marked
-Part2 in your repository.
-
-### Step 1 : Create your project.
-Using the instructions on the Google forum, please set up a new Visual Studio
-project that compiles using CUDA. For uniformity, please write your main
-function and all your code in a file named matrix_math.cu.
-
-### Step 2 : Setting up CUDA memory.
-As we discussed in class, there is host memory and device memory. Host memory
-is the memory that exists on the CPU, whereas device memory is memory on the
-GPU.
-
-In order to create/reserve memory on the GPU, we need to do so explicitly
-using cudaMalloc. Calling cudaMalloc is like calling malloc on the GPU: it
-reserves a portion of device memory and returns a pointer to it. This memory
-is only accessible on the device unless we explicitly copy it from the GPU to
-the CPU. The reverse is also true.
-
-We can copy memory to and from the GPU using the function cudaMemcpy. Like the
-C standard library's memcpy, you will need to specify the size of memory you
-are copying. The last argument specifies the direction of the copy (from GPU
-to CPU or the other way around).
-
-Please initialize two 5 x 5 matrices, each represented as an array of floats,
-on both the CPU and the GPU, where each entry is equal to its position (i.e.
-A_00 = 0, A_01 = 1, ..., A_44 = 24).
-
-### Step 3 : Creating CUDA kernels.
-In the previous part, we explicitly separated the CUDA kernels from the rest
-of the file for stylistic purposes. Since there will be far less code in this
-project, we will write the global and device functions in the same file as the
-main function.
-
-Given a matrix A and a matrix B (both represented as arrays of floats), please
-write the following functions :
-* mat_add : A + B
-* mat_sub : A - B
-* mat_mult : A * B
-
-You may assume for all matrices that the dimensions of A and B are the same
-and that they are square.
-
-Use the two 5 x 5 matrices to test your code, either by printing directly to
-the console or by writing an assert.
-
-THINGS TO REMEMBER :
-* Global and device functions only have access to memory that is explicitly on
-  the device, meaning you MUST copy memory from the CPU to the GPU if you
-  would like to use it there.
-* The triple angle brackets "<<< >>>" enclose the execution configuration of a
-  global function call. These parameters tell CUDA the grid and block
-  dimensions to launch with, and therefore how many threads will run.
-* Do not fret if IntelliSense does not understand CUDA keywords (i.e. if it
-  puts red squiggly lines underneath them). There is a way to integrate CUDA
-  syntax highlighting into Visual Studio, but it is not the default.
-
-### Step 4 : Write a serial version.
-For comparison, write a single-threaded CPU version of mat_add, mat_sub and
-mat_mult. We will not introduce timing elements in this project, but please
-keep them in mind, as the upcoming lab will cover this topic in more depth.
-
-## PART 4 : PERFORMANCE ANALYSIS
-Since this is the first project, we will guide you with some example
-questions. In future projects, please answer at least these questions, as
-they cover basic performance analysis. Please go above and beyond the
-questions we suggest and explore how different aspects of your code impact
-performance as a whole.
-
-We have provided a frame counter as a metric, but feel free to add CUDA event
-timers, etc. to do more fine-grained benchmarking of various aspects.
-
-NOTE : Performance should be measured in comparison to a baseline. Be sure to
-describe the changes you make between experiments and how you are
-benchmarking.
 * How does changing the tile and block sizes change performance? Why?
+
+This is a hard question to answer. The best values depend both on the specific
+architecture of the card and on how the problem is distributed across it. One
+rule of thumb is that the block size should be a multiple of 32, since the
+card rounds it up to a multiple of the warp size anyway. Beyond that,
+experimentation is often required to find the best configuration. You do need
+enough warps in flight to hide latency, though, so up to a point increasing
+these values increases performance. (The warp size and per-block limits behind
+this rule can be queried at runtime; see the cudaGetDeviceProperties sketch
+after the patch, along with a shared-memory tiling sketch for the "tile size"
+half of the question.)
+
 * How does changing the number of planets change performance? Why?
+
+The fewer the planets, the faster the code runs. The simulation is not exactly
+optimized (there is a lot of global memory access, for example), and the
+brute-force force calculation is O(N^2), so each additional body adds compute
+time for every other body. (The cudaEvent timing sketch after the patch is one
+way to measure this scaling.)
+
 * Without running experiments, how would you expect the serial and GPU
 versions of matrix_math to compare? Why?
-
-## SUBMISSION
-Please commit your changes to your forked version of the repository and open a
-pull request. Please write your performance analysis in your README.md.
-Remember to email Harmony (harmoli+CIS565@seas.upenn.edu) your grade and why.
+
+The serial versions should be much slower than the GPU versions, and the
+difference should become more pronounced as the matrix dimensions grow. (A
+minimal serial mat_mult for comparison is sketched after the patch.)
\ No newline at end of file
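
NOTES : A few illustrative sketches for the answers above. None of this is
part of the patch, and names and launch parameters are placeholders. First,
the warp size and per-block limits behind the "multiple of 32" rule can be
queried at runtime with cudaGetDeviceProperties:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main (){
        cudaDeviceProp prop;
        cudaGetDeviceProperties (&prop, 0); //query device 0

        printf ("warp size             : %d\n", prop.warpSize);
        printf ("max threads per block : %d\n", prop.maxThreadsPerBlock);
        printf ("multiprocessor count  : %d\n", prop.multiProcessorCount);
        return 0;
    }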
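For the "tile size" half of the question: one common optimization this
submission does not attempt is staging tiles of the input matrices in shared
memory, so each global-memory value is loaded once per tile instead of once
per output element. A sketch, assuming row-major square matrices; TILE is a
tunable compile-time constant, not a value from this submission:

    #define TILE 5

    __global__ void mat_mult_tiled (int dim, const float * A, const float * B, float * out){
        __shared__ float tileA [TILE][TILE];
        __shared__ float tileB [TILE][TILE];

        int row = (blockIdx.y * TILE) + threadIdx.y;
        int col = (blockIdx.x * TILE) + threadIdx.x;

        float outVal = 0.0f;
        for (int t=0; t<(dim + TILE - 1)/TILE; t++){
            //Stage one tile of A and one tile of B in shared memory.
            int aCol = (t*TILE) + threadIdx.x;
            int bRow = (t*TILE) + threadIdx.y;
            tileA[threadIdx.y][threadIdx.x] = (row < dim && aCol < dim) ? A[(row*dim) + aCol] : 0.0f;
            tileB[threadIdx.y][threadIdx.x] = (bRow < dim && col < dim) ? B[(bRow*dim) + col] : 0.0f;
            __syncthreads ();

            //Partial dot product over this tile.
            for (int i=0; i<TILE; i++){
                outVal += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
            }
            __syncthreads ();
        }

        if (row < dim && col < dim){
            out[(row*dim) + col] = outVal;
        }
    }

It would be launched with a TILE x TILE block and a grid of
ceil(dim/TILE) x ceil(dim/TILE) blocks.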
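The README suggests adding CUDA timers for finer-grained benchmarking;
cudaEvent_t is the standard tool. A minimal sketch around one of the kernels
above (the launch shown is the 5 x 5 mat_mult from main_kernel.cu):

    cudaEvent_t start, stop;
    cudaEventCreate (&start);
    cudaEventCreate (&stop);

    cudaEventRecord (start, 0);  //mark the start on the default stream
    mat_mult<<<1, dim3(5,5)>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
    cudaEventRecord (stop, 0);   //mark the end
    cudaEventSynchronize (stop); //wait until the kernel has finished

    float elapsedMs = 0.0f;
    cudaEventElapsedTime (&elapsedMs, start, stop); //GPU time in milliseconds
    printf ("mat_mult took %f ms\n", elapsedMs);

    cudaEventDestroy (start);
    cudaEventDestroy (stop);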
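Finally, a single-threaded CPU mat_mult of the kind Step 4 asks for, useful as
the baseline in the serial-versus-GPU comparison. A sketch using the same
row-major layout as main_kernel.cu:

    //Serial reference: out = A * B for square dim x dim row-major matrices.
    void mat_mult_serial (int dim, const float * A, const float * B, float * out){
        for (int row=0; row<dim; row++){
            for (int col=0; col<dim; col++){
                float outVal = 0.0f;
                for (int i=0; i<dim; i++){
                    outVal += A[(row*dim) + i] * B[(i*dim) + col];
                }
                out[(row*dim) + col] = outVal;
            }
        }
    }

The three nested loops make the O(dim^3) work explicit, which is why the gap
versus the GPU version should widen as the matrices grow.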