From 7c7f3154a15848b36a30ccdd513ebd7506bbef5f Mon Sep 17 00:00:00 2001
From: Rohith Chandran
Date: Sat, 20 Sep 2014 14:30:04 -0400
Subject: [PATCH] Submission hopefully

---
 Part1/src/kernel.cu      |  50 +++++++++++++-
 Part1/src/main_kernel.cu | 144 +++++++++++++++++++++++++++++++++++++++
 README.md                | 126 ++++------------------------
 3 files changed, 203 insertions(+), 117 deletions(-)
 create mode 100644 Part1/src/main_kernel.cu

diff --git a/Part1/src/kernel.cu b/Part1/src/kernel.cu
index ee9b9e5..e02addf 100644
--- a/Part1/src/kernel.cu
+++ b/Part1/src/kernel.cu
@@ -83,25 +83,59 @@ __global__ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::vec4 * pos)
 	}
 }
 
+//Gravitational acceleration that body "other" (w component = mass) exerts on body "me".
+__device__ glm::vec3 calculateSingleAcceleration (glm::vec4 me, glm::vec4 other){
+	glm::vec3 outAcceleration (0,0,0);
+
+	glm::vec4 distance4 = other-me;
+	glm::vec3 distance (distance4.x, distance4.y, distance4.z);
+	float length = glm::length (distance);
+
+	//Ignore bodies closer than 0.1 units so the 1/r^2 term cannot blow up.
+	if (length > 0.1f){
+		outAcceleration = (float(G) * other.w / (length*length)) * (distance/length);
+	}
+
+	return outAcceleration;
+}
+
 // TODO: Core force calc kernel global memory
 // HINT : You may want to write a helper function that will help you
 // calculate the acceleration contribution of a single body.
 // REMEMBER : F = (G * m_a * m_b) / (r_ab ^ 2)
 __device__ glm::vec3 accelerate(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
 {
-	return glm::vec3(0.0f);
+	//Start with the pull of the central star, then accumulate the
+	//contribution of every other body.
+	glm::vec3 outAcc = calculateSingleAcceleration (my_pos, glm::vec4(0,0,0,starMass));
+	for (int i=0; i<N; i++){
+		outAcc += calculateSingleAcceleration (my_pos, their_pos[i]);
+	}
+	return outAcc;
 }
 
+//Update the acceleration of each body by brute-force summation over all bodies.
 __global__ void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
 {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N){
+		acc[index] = accelerate (N, pos[index], pos);
+	}
 }
 
+//Update velocity and position with simple Euler integration.
 __global__ void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
 {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N){
+		vel[index] += acc[index] * dt;
+		pos[index].x += vel[index].x * dt;
+		pos[index].y += vel[index].y * dt;
+		pos[index].z += vel[index].z * dt;
+	}
 }
 
 void cudaNBodyUpdateWrapper(float dt)
 {
+	dim3 fullBlocksPerGrid ((int)ceil(float(numObjects)/float(blockSize)));
+
+	updateF<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
+	checkCUDAErrorWithLine("Kernel failed!");
+
+	updateS<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
+	checkCUDAErrorWithLine("Kernel failed!");
+
+	cudaThreadSynchronize ();
+
 }
 
 void cudaUpdateVBO(float * vbodptr, int width, int height)
diff --git a/Part1/src/main_kernel.cu b/Part1/src/main_kernel.cu
new file mode 100644
index 0000000..be911f4
--- /dev/null
+++ b/Part1/src/main_kernel.cu
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include "glm/glm.hpp"
+#include <iostream>
+
+using namespace std;
+
+//Initialize memory, update some globals
+void initCuda(int N)
+{
+	cudaThreadSynchronize();
+}
+
+//Elementwise add: one thread per entry.
+__global__ void mat_add(int n, float * A, float * B, float * out){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		out[index] = A[index] + B[index];
+	}
+}
+
+//Elementwise subtract: one thread per entry.
+__global__ void mat_sub(int n, float * A, float * B, float * out){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		out[index] = A[index] - B[index];
+	}
+}
+
+//Matrix multiply: thread (row, col) computes the dot product of row "row"
+//of A with column "col" of B.
+__global__ void mat_mult(int n, float * A, float * B, float * out){
+	int row = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int col = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	int singleDim = sqrt(float(n)); //n is the element count of a square matrix
+
+	if (row < singleDim && col < singleDim){
+		float outVal = 0;
+		for (int i=0; i<singleDim; i++){
+			outVal += A[(row*singleDim) + i] * B[(i*singleDim) + col];
+		}
+		out[(row*singleDim) + col] = outVal;
+	}
+}
+
+//Print a dim x dim row-major matrix to the console.
+void printMat (float * mat, int dim){
+	for (int row=0; row<dim; row++){
+		for (int col=0; col<dim; col++){
+			printf ("%6.1f ", mat[(row*dim) + col]);
+		}
+		printf ("\n");
+	}
+	printf ("\n");
+}
+
+int main (int argc, char** argv){
+	//Two 5 x 5 matrices where each entry equals its position:
+	//A_00 = 0, A_01 = 1, ..., A_44 = 24.
+	float * myCPUArray1 = new float [25];
+	float * myCPUArray2 = new float [25];
+	float outCPUArray [25];
+
+	for (int i=0; i<25; i++){
+		myCPUArray1[i] = float(i);
+		myCPUArray2[i] = float(i);
+	}
+
+	float * myGPUArray1;
+	float * myGPUArray2;
+	float * outGPUArray;
+
+	cudaMalloc ((void**)&myGPUArray1, 25*sizeof(float));
+	cudaMalloc ((void**)&myGPUArray2, 25*sizeof(float));
+	cudaMalloc ((void**)&outGPUArray, 25*sizeof(float));
+
+	cudaMemcpy( myGPUArray1, myCPUArray1, 25*sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy( myGPUArray2, myCPUArray2, 25*sizeof(float), cudaMemcpyHostToDevice);
+
+	//One block of 25 threads for the elementwise kernels,
+	//one 5 x 5 block for the multiply.
+	dim3 threadsPerBlock (5, 5);
+
+	mat_add<<<1, 25>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	mat_sub<<<1, 25>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	cudaMemcpy( outGPUArray, myCPUArray2, 25*sizeof(float), cudaMemcpyHostToDevice);
+	mat_mult<<<1, threadsPerBlock>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
+	cudaMemcpy( outCPUArray, outGPUArray, 25*sizeof(float), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	printMat (outCPUArray, 5);
+
+	delete [] myCPUArray1;
+	delete [] myCPUArray2;
+
+	cudaFree (myGPUArray1);
+	cudaFree (myGPUArray2);
+	cudaFree (outGPUArray);
+
+	std::cin.ignore (); //keep the console window open until a key is pressed
+	return 0;
+}
diff --git a/README.md b/README.md
index 70ae0d3..17f0865 100644
--- a/README.md
+++ b/README.md
@@ -1,120 +1,18 @@
-Project 1
-=========
-# Project 1 : Introduction to CUDA
-
-## NOTE :
-This project (and all other projects in this course) requires an NVIDIA
-graphics card with CUDA capability! Any card with compute capability 2.0 and
-up will work. This means any card from the GeForce 400 and 500 series onwards
-will work. If you do not have a machine with these specs, feel free to use
-computers in the SIG Lab. All computers in SIG Lab and Moore 100C have CUDA
-capable cards and should already have the CUDA SDK installed.
-
-## PART 1 : INSTALL NSIGHT
-To help with benchmarking and performance analysis, we will be using NVIDIA's
-profiling and debugging tool, NSight. Download and install it from the
-following link for whichever IDE you will be using:
-http://www.nvidia.com/object/nsight.html.
-
-NOTE : If you are using Linux / Mac, most of the screenshots and in-class
-usage of NSight will be in Visual Studio. You are free to use the Eclipse
-version of NSight during the in-class labs, but we will not be able to help
-you as much.
-
-## PART 2 : NBODY SIMULATION
-To get you used to writing CUDA kernels, we will build a simple 2D n-body
-simulator. The following source files are included in the project:
-
-* main.cpp : sets up the graphics code for visualization
-* kernel.cu : contains the CUDA kernels and the calls into them
-
-All the code that you will need to modify is in kernel.cu and is marked
-clearly by TODOs.
-
-## PART 3 : MATRIX MATH
-In this portion we will walk you through setting up a project that implements
-some simple matrix math functions. Please put this portion in a folder marked
-Part2 in your repository.
-
-### Step 1 : Create your project.
-Using the instructions on the Google forum, please set up a new Visual Studio
-project that compiles using CUDA. For uniformity, please write your main
-function and all your code in a file named matrix_math.cu.
-
-### Step 2 : Setting up CUDA memory.
-As we discussed in class, there is host memory and device memory. Host memory
-is the memory that exists on the CPU, whereas device memory is memory on the
-GPU.
-
-In order to create/reserve memory on the GPU, we need to do so explicitly
-using cudaMalloc. Calling cudaMalloc is like calling malloc on the GPU: it
-reserves a portion of device memory and returns a pointer to it. This memory
-is only accessible on the device unless we explicitly copy it from the GPU to
-the CPU. The reverse is also true.
-
-We can copy memory to and from the GPU using the function cudaMemcpy. Like the
-C standard library's memcpy, you will need to specify the size of memory you
-are copying. The last argument specifies the direction of the copy (from GPU
-to CPU or the other way around).
-
-Please initialize two 5 x 5 matrices, each represented as an array of floats,
-on both the CPU and the GPU, where each entry is equal to its position (i.e.
-A_00 = 0, A_01 = 1, ..., A_44 = 24).
-
-### Step 3 : Creating CUDA kernels.
-In the previous part, we explicitly separated the CUDA kernels from the rest
-of the file for stylistic purposes. Since there will be far less code in this
-project, we will write the global and device functions in the same file as the
-main function.
-
-Given a matrix A and a matrix B (both represented as arrays of floats), please
-write the following functions :
-* mat_add : A + B
-* mat_sub : A - B
-* mat_mult : A * B
-
-You may assume for all matrices that the dimensions of A and B are the same
-and that they are square.
-
-Use the two 5 x 5 matrices to test your code, either by printing directly to
-the console or by writing an assert.
-
-THINGS TO REMEMBER :
-* Global and device functions only have access to memory that is explicitly on
-  the device, meaning you MUST copy memory from the CPU to the GPU if you
-  would like to use it there.
-* The triple angle brackets "<<< >>>" enclose the execution configuration of a
-  global function call. These parameters tell CUDA the grid and block
-  dimensions to launch with, and therefore how many threads will run.
-* Do not fret if IntelliSense does not understand CUDA keywords (i.e. if it
-  puts red squiggly lines underneath them). There is a way to integrate CUDA
-  syntax highlighting into Visual Studio, but it is not the default.
-
-### Step 4 : Write a serial version.
-For comparison, write a single-threaded CPU version of mat_add, mat_sub and
-mat_mult. We will not introduce timing elements in this project, but please
-keep them in mind, as the upcoming lab will cover this topic in more depth.
-
-## PART 4 : PERFORMANCE ANALYSIS
-Since this is the first project, we will guide you with some example
-questions. In future projects, please answer at least these questions, as
-they cover basic performance analysis. Please go above and beyond the
-questions we suggest and explore how different aspects of your code impact
-performance as a whole.
-
-We have provided a frame counter as a metric, but feel free to add CUDA event
-timers, etc. to do more fine-grained benchmarking of various aspects.
-
-NOTE : Performance should be measured in comparison to a baseline. Be sure to
-describe the changes you make between experiments and how you are
-benchmarking.
 * How does changing the tile and block sizes change performance? Why?
+
+This is a hard question to answer. The best values depend both on the specific
+architecture of the card and on how the problem is distributed across it. One
+rule of thumb is that the block size should be a multiple of 32, since the
+card rounds it up to a multiple of the warp size anyway. Beyond that,
+experimentation is often required to find the best configuration. You do need
+enough warps in flight to hide latency, though, so up to a point increasing
+these values increases performance. (The warp size and per-block limits behind
+this rule can be queried at runtime; see the cudaGetDeviceProperties sketch
+after the patch, along with a shared-memory tiling sketch for the "tile size"
+half of the question.)
+
 * How does changing the number of planets change performance? Why?
+
+The fewer the planets, the faster the code runs. The simulation is not exactly
+optimized (there is a lot of global memory access, for example), and the
+brute-force force calculation is O(N^2), so each additional body adds compute
+time for every other body. (The cudaEvent timing sketch after the patch is one
+way to measure this scaling.)
+
 * Without running experiments, how would you expect the serial and GPU
 versions of matrix_math to compare? Why?
-
-## SUBMISSION
-Please commit your changes to your forked version of the repository and open a
-pull request. Please write your performance analysis in your README.md.
-Remember to email Harmony (harmoli+CIS565@seas.upenn.edu) your grade and why.
+
+The serial versions should be much slower than the GPU versions, and the
+difference should become more pronounced as the matrix dimensions grow. (A
+minimal serial mat_mult for comparison is sketched after the patch.)
\ No newline at end of file
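
NOTES : A few illustrative sketches for the answers above. None of this is
part of the patch, and names and launch parameters are placeholders. First,
the warp size and per-block limits behind the "multiple of 32" rule can be
queried at runtime with cudaGetDeviceProperties:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main (){
        cudaDeviceProp prop;
        cudaGetDeviceProperties (&prop, 0); //query device 0

        printf ("warp size             : %d\n", prop.warpSize);
        printf ("max threads per block : %d\n", prop.maxThreadsPerBlock);
        printf ("multiprocessor count  : %d\n", prop.multiProcessorCount);
        return 0;
    }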
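For the "tile size" half of the question: one common optimization this
submission does not attempt is staging tiles of the input matrices in shared
memory, so each global-memory value is loaded once per tile instead of once
per output element. A sketch, assuming row-major square matrices; TILE is a
tunable compile-time constant, not a value from this submission:

    #define TILE 5

    __global__ void mat_mult_tiled (int dim, const float * A, const float * B, float * out){
        __shared__ float tileA [TILE][TILE];
        __shared__ float tileB [TILE][TILE];

        int row = (blockIdx.y * TILE) + threadIdx.y;
        int col = (blockIdx.x * TILE) + threadIdx.x;

        float outVal = 0.0f;
        for (int t=0; t<(dim + TILE - 1)/TILE; t++){
            //Stage one tile of A and one tile of B in shared memory.
            int aCol = (t*TILE) + threadIdx.x;
            int bRow = (t*TILE) + threadIdx.y;
            tileA[threadIdx.y][threadIdx.x] = (row < dim && aCol < dim) ? A[(row*dim) + aCol] : 0.0f;
            tileB[threadIdx.y][threadIdx.x] = (bRow < dim && col < dim) ? B[(bRow*dim) + col] : 0.0f;
            __syncthreads ();

            //Partial dot product over this tile.
            for (int i=0; i<TILE; i++){
                outVal += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
            }
            __syncthreads ();
        }

        if (row < dim && col < dim){
            out[(row*dim) + col] = outVal;
        }
    }

It would be launched with a TILE x TILE block and a grid of
ceil(dim/TILE) x ceil(dim/TILE) blocks.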
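The README suggests adding CUDA timers for finer-grained benchmarking;
cudaEvent_t is the standard tool. A minimal sketch around one of the kernels
above (the launch shown is the 5 x 5 mat_mult from main_kernel.cu):

    cudaEvent_t start, stop;
    cudaEventCreate (&start);
    cudaEventCreate (&stop);

    cudaEventRecord (start, 0);  //mark the start on the default stream
    mat_mult<<<1, dim3(5,5)>>> (25, myGPUArray1, myGPUArray2, outGPUArray);
    cudaEventRecord (stop, 0);   //mark the end
    cudaEventSynchronize (stop); //wait until the kernel has finished

    float elapsedMs = 0.0f;
    cudaEventElapsedTime (&elapsedMs, start, stop); //GPU time in milliseconds
    printf ("mat_mult took %f ms\n", elapsedMs);

    cudaEventDestroy (start);
    cudaEventDestroy (stop);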
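Finally, a single-threaded CPU mat_mult of the kind Step 4 asks for, useful as
the baseline in the serial-versus-GPU comparison. A sketch using the same
row-major layout as main_kernel.cu:

    //Serial reference: out = A * B for square dim x dim row-major matrices.
    void mat_mult_serial (int dim, const float * A, const float * B, float * out){
        for (int row=0; row<dim; row++){
            for (int col=0; col<dim; col++){
                float outVal = 0.0f;
                for (int i=0; i<dim; i++){
                    outVal += A[(row*dim) + i] * B[(i*dim) + col];
                }
                out[(row*dim) + col] = outVal;
            }
        }
    }

The three nested loops make the O(dim^3) work explicit, which is why the gap
versus the GPU version should widen as the matrices grow.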