From c12207cb630c25a5ac16ecf66e8d3fafcd76979e Mon Sep 17 00:00:00 2001 From: Arib Alam Date: Wed, 22 Apr 2020 21:58:58 +0530 Subject: [PATCH 1/4] reverted back to original --- main.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/main.sh b/main.sh index 4c0e928..c9b4001 100644 --- a/main.sh +++ b/main.sh @@ -3,15 +3,11 @@ mkdir weights rm -rf build rm -rf data rm -rf eval -virtualenv env -source env/bin/activate -pip install -r requirements.txt cd processing python resize.py bash get_standard_annotations.sh cd ../scripts python create_targets.py standard -deactivate cd .. mkdir weights mkdir build From c5fbd6920de3d399758aa381af45217844148e17 Mon Sep 17 00:00:00 2001 From: Arib Alam Date: Sun, 24 May 2020 04:56:42 +0530 Subject: [PATCH 2/4] removed print statements --- include/data_utils.h | 2 +- src/model.cu | 343 +++++++++++++++---------------------------- 2 files changed, 123 insertions(+), 222 deletions(-) diff --git a/include/data_utils.h b/include/data_utils.h index b329f01..94ca891 100644 --- a/include/data_utils.h +++ b/include/data_utils.h @@ -81,7 +81,7 @@ float* get_float_array(string filename_) { // printf("%f\n", result); } - printf("%f Sum of read array, %lu\n", sum, arr.size()); + // printf("%f Sum of read array, %lu\n", sum, arr.size()); float *out = (float *)malloc(sizeof(float) * arr.size()); for(int i = 0;i < arr.size();i++) out[i] = arr[i]; diff --git a/src/model.cu b/src/model.cu index 3a5fa4d..b4dccaa 100644 --- a/src/model.cu +++ b/src/model.cu @@ -84,18 +84,6 @@ static inline unsigned int RoundUp(unsigned int nominator, unsigned int denomina #include "../include/sigmoid.h" #include "../include/data_utils.h" -void pprint(float* matrix, int size, int width) -{ - for(int i=0; i>>(grad_data, s9_out, target, mask, batch_size); - float loss = MSELoss(cpu_out, cpu_target, cpu_mask, c9.output_size); - - printf("Loss Value : %f\n", loss); - - /* Backward Propagation from s9 layer, assuming NxCx13x13 shaped upstream gradient available */ - // checkCudaErrors(cudaMemcpyAsync(grad_data, grad_data_cpu, sizeof(float) * s9.output_size, cudaMemcpyHostToDevice)); - s9.backward(grad_data, s9_dout); - c9.backward(s9_dout, c9.input_descriptor, r8_out); + c5.forward(r4_out, c5_out); + r5.forward(c5_out, r5_out); + m5.forward(r5_out, m5_out); + c6.forward(m5_out, c6_out); + r6.forward(c6_out, r6_out); + m6.forward(r6_out, m6_out); + c7.forward(m6_out, c7_out); + r7.forward(c7_out, r7_out); + c8.forward(r7_out, c8_out); + r8.forward(c8_out, r8_out); + c9.forward(r8_out, c9_out); + s9.forward(c9_out, s9_out); + checkCudaErrors(cudaMemcpy(cpu_out, s9_out, sizeof(float) * c9.output_size, cudaMemcpyDeviceToHost)); + + /* Calculate Loss using the final NxCx13x13 shaped cpu_out tensor */ + // + // TO BE WRITTEN + MSELossBackprop<<>>(grad_data, s9_out, target, mask, batch_size); + float loss = MSELoss(cpu_out, cpu_target, cpu_mask, c9.output_size); + + printf("Loss Value : %f\n", loss); + + /* Backward Propagation from s9 layer, assuming NxCx13x13 shaped upstream gradient available */ + s9.backward(grad_data, s9_dout); + c9.backward(s9_dout, c9.input_descriptor, r8_out); + + r8.backward(c9.grad_data, r8_dout); + c8.backward(r8_dout, c8.input_descriptor, r7_out); - r8.backward(c9.grad_data, r8_dout); - c8.backward(r8_dout, c8.input_descriptor, r7_out); - - r7.backward(c8.grad_data, r7_dout); - c7.backward(r7_dout, c7.input_descriptor, m6_out); - - m6.backward(r6_out, c7.grad_data, m6_out, m6_dout); - r6.backward(m6_dout, r6_dout); - c6.backward(r6_dout, c6.input_descriptor, m5_out); - - 
m5.backward(r5_out, c6.grad_data, m5_out, m5_dout); - r5.backward(m5_dout, r5_dout); - c5.backward(r5_dout, c5.input_descriptor, r4_out); - - r4.backward(c5.grad_data, r4_dout); - c4.backward(r4_dout, c4.input_descriptor, r3_out); - - r3.backward(c4.grad_data, r3_dout); - c3.backward(r3_dout, c3.input_descriptor, m2_out); - - m2.backward(r2_out, c3.grad_data, m2_out, m2_dout); - r2.backward(m2_dout, r2_dout); - c2.backward(r2_dout, c2.input_descriptor, m1_out); - - m1.backward(r1_out, c2.grad_data, m1_out, m1_dout); - r1.backward(m1_dout, r1_dout); - c1.backward(r1_dout, c1.input_descriptor, data); - - int t; - t = c9.output_size; - float *gd = (float *)malloc(sizeof(float) * t); - checkCudaErrors(cudaMemcpy(gd, grad_data, sizeof(float) * t, cudaMemcpyDeviceToHost)); - std::cout << "Printing grad data . . . \n"; - float sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += gd[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; - - - std::cout << "Printing target . . . \n"; - sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += cpu_target[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; - - std::cout << "Printing mask . . . \n"; - sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += cpu_mask[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; - - t = c9.in_channels * c9.kernel_size * c9.kernel_size * c9.out_channels; - float *grad_kernel = (float *)malloc(sizeof(float) * t); - checkCudaErrors(cudaMemcpy(grad_kernel, c9.grad_kernel, sizeof(float) * t, cudaMemcpyDeviceToHost)); - - std::cout<<"Printing grad_kernels . . .\n"; - sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += grad_kernel[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; + r7.backward(c8.grad_data, r7_dout); + c7.backward(r7_dout, c7.input_descriptor, m6_out); - t = c9.out_channels; - float *grad_bias = (float *)malloc(sizeof(float) * t); - checkCudaErrors(cudaMemcpy(grad_bias, c9.grad_bias, sizeof(float) * t, cudaMemcpyDeviceToHost)); - - // printf("Backward Pass Done!\n\n\n"); - - std::cout<<"Printing grad_bias . . .\n"; - sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += grad_bias[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; + m6.backward(r6_out, c7.grad_data, m6_out, m6_dout); + r6.backward(m6_dout, r6_dout); + c6.backward(r6_dout, c6.input_descriptor, m5_out); - t = c9.in_channels * c9.kernel_size * c9.kernel_size * c9.out_channels; - float *ker = (float *)malloc(sizeof(float) * t); - checkCudaErrors(cudaMemcpy(ker, c9.param_kernel, sizeof(float) * t, cudaMemcpyDeviceToHost)); - - std::cout<<"Printing kernel of c9 . . .\n"; - sum_ = 0.0; - for(int i = 0;i < t;i++) - sum_ += ker[i]; - std::cout << sum_ << std::endl; - std::cout << std::endl; - - - - /* Update weights i.e., step*/ - c9.updateWeights(learning_rate); - c8.updateWeights(learning_rate); - c7.updateWeights(learning_rate); - c6.updateWeights(learning_rate); - c5.updateWeights(learning_rate); - c4.updateWeights(learning_rate); - c3.updateWeights(learning_rate); - c2.updateWeights(learning_rate); - c1.updateWeights(learning_rate); - - //Check if kernel weights are getting updated - // t = c1.in_channels * c1.kernel_size * c1.kernel_size * c1.out_channels; - // float *kernel = (float *)malloc(sizeof(float) * t); - // checkCudaErrors(cudaMemcpy(kernel, c1.param_kernel, sizeof(float) * t, cudaMemcpyDeviceToHost)); - // std::cout<<"Printing kernel of c1 after update. . 
.\n"; - // for(int i = 0;i < t; i++) - // std::cout << kernel[i] << " "; - // std::cout << std::endl; + m5.backward(r5_out, c6.grad_data, m5_out, m5_dout); + r5.backward(m5_dout, r5_dout); + c5.backward(r5_dout, c5.input_descriptor, r4_out); - free(cpu_data); - free(cpu_target); - free(cpu_mask); - /* Training Loop Ends Here! */ - - /* Save trained weights */ - if(iter%SAVE_FREQUENCY == 0) { - string save_path = "../weights/"; - c9.save_params(str_to_char_arr(save_path + + "9")); - c8.save_params(str_to_char_arr(save_path + + "8")); - c7.save_params(str_to_char_arr(save_path + + "7")); - c6.save_params(str_to_char_arr(save_path + + "6")); - c5.save_params(str_to_char_arr(save_path + + "5")); - c4.save_params(str_to_char_arr(save_path + + "4")); - c3.save_params(str_to_char_arr(save_path + + "3")); - c2.save_params(str_to_char_arr(save_path + + "2")); - c1.save_params(str_to_char_arr(save_path + + "1")); - } + r4.backward(c5.grad_data, r4_dout); + c4.backward(r4_dout, c4.input_descriptor, r3_out); + + r3.backward(c4.grad_data, r3_dout); + c3.backward(r3_dout, c3.input_descriptor, m2_out); + + m2.backward(r2_out, c3.grad_data, m2_out, m2_dout); + r2.backward(m2_dout, r2_dout); + c2.backward(r2_dout, c2.input_descriptor, m1_out); + + m1.backward(r1_out, c2.grad_data, m1_out, m1_dout); + r1.backward(m1_dout, r1_dout); + c1.backward(r1_dout, c1.input_descriptor, data); + + /* Update weights i.e., step*/ + c9.updateWeights(learning_rate); + c8.updateWeights(learning_rate); + c7.updateWeights(learning_rate); + c6.updateWeights(learning_rate); + c5.updateWeights(learning_rate); + c4.updateWeights(learning_rate); + c3.updateWeights(learning_rate); + c2.updateWeights(learning_rate); + c1.updateWeights(learning_rate); + + free(cpu_data); + free(cpu_target); + free(cpu_mask); + /* Training Loop Ends Here! 
*/ + + /* Save trained weights */ + if(iter%SAVE_FREQUENCY == 0) { + string save_path = "../weights/"; + c9.save_params(str_to_char_arr(save_path + + "9")); + c8.save_params(str_to_char_arr(save_path + + "8")); + c7.save_params(str_to_char_arr(save_path + + "7")); + c6.save_params(str_to_char_arr(save_path + + "6")); + c5.save_params(str_to_char_arr(save_path + + "5")); + c4.save_params(str_to_char_arr(save_path + + "4")); + c3.save_params(str_to_char_arr(save_path + + "3")); + c2.save_params(str_to_char_arr(save_path + + "2")); + c1.save_params(str_to_char_arr(save_path + + "1")); + } } printf("Done\n\n\n"); From 9747319f471c2243146454c086c76f51e5e85803 Mon Sep 17 00:00:00 2001 From: Arib Alam Date: Sun, 24 May 2020 05:11:06 +0530 Subject: [PATCH 3/4] added variables and memory allocation for momentum variables --- include/convolution.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/include/convolution.h b/include/convolution.h index 5ad23cc..32d794d 100644 --- a/include/convolution.h +++ b/include/convolution.h @@ -6,6 +6,10 @@ class Conv // alpha and beta are scaling constants for the operations, use these default values const float alpha = 1.0f; const float beta = 0.0f; + const float beta1 = 0.9f; + const float beta2 = 0.999f; + const float one_minus_beta1 = 1 - beta1; + const float one_minus_beta2 = 1 - beta2; /* Tensor Descriptors for our operation */ cudnnTensorDescriptor_t input_descriptor; @@ -36,10 +40,25 @@ class Conv float *grad_bias; float *grad_data; // gradient with respect input of convolution, Note : INPUT + // Momentum variables on GPU + float* v_kernel; + float* s_kernel; + + float* v_bias; + float* s_bias; + /*** These variables are on CPU ***/ std::vector cpu_param_kernel; std::vector cpu_param_bias; + // momentum variables on CPU + std::vector cpu_v_kernel; + std::vector cpu_s_kernel; + + std::vector cpu_v_bias; + std::vector cpu_s_bias; + + /*** Definition variables we would be using ***/ int input_size; int output_size; @@ -197,6 +216,13 @@ class Conv // Gradient with respect to output has same shape as output checkCudaErrors(cudaMalloc(&grad_data, sizeof(float) * batch_size * out_height * out_width * out_channels)); + // Allocate memory for momentum variables + checkCudaErrors(cudaMalloc(&v_kernel, sizeof(float) * in_channels * kernel_size * kernel_size * out_channels)); + checkCudaErrors(cudaMalloc(&s_kernel, sizeof(float) * in_channels * kernel_size * kernel_size * out_channels)); + + checkCudaErrors(cudaMalloc(&v_bias, sizeof(float) * out_channels)); + checkCudaErrors(cudaMalloc(&s_bias, sizeof(float) * out_channels)); + input_size = batch_size * height * width * in_channels; output_size = batch_size * out_height * out_width * out_channels; @@ -204,6 +230,13 @@ class Conv cpu_param_kernel = std::vector(in_channels * kernel_size * kernel_size * out_channels, 0); cpu_param_bias = std::vector(out_channels, 0); //BIAS INIT TO ZERO! + + cpu_v_kernel = std::vector(in_channels * kernel_size * kernel_size * out_channels, 0); + cpu_s_kernel = std::vector(in_channels * kernel_size * kernel_size * out_channels, 0); + + cpu_v_bias = std::vector(out_channels, 0); //BIAS INIT TO ZERO! + cpu_s_bias = std::vector(out_channels, 0); //BIAS INIT TO ZERO! 
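The training loop in the previous patch computes a masked MSE loss through MSELoss (CPU) and MSELossBackprop (GPU), but neither definition appears in this series. A minimal sketch of such a pair, assuming a 0/1 mask and mean reduction, is given here; the signatures are inferred from the call sites (the real MSELossBackprop is launched with batch_size rather than an element count) and are illustrative only.

// Illustrative sketch only: the project's real MSELoss / MSELossBackprop are not shown here.
float MSELoss(const float* out, const float* target, const float* mask, int size)
{
    // Masked mean-squared error over all output elements (mask assumed 0 or 1).
    float loss = 0.0f;
    for (int i = 0; i < size; i++) {
        float diff = (out[i] - target[i]) * mask[i];
        loss += diff * diff;
    }
    return loss / size;
}

__global__ void MSELossBackprop(float* grad, const float* out, const float* target,
                                const float* mask, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= size)
        return;
    // d(loss)/d(out[id]) = 2 * (out - target) / size, zeroed where the mask is 0.
    grad[id] = 2.0f * (out[id] - target[id]) * mask[id] / size;
}
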
+ // Initialize Parameters on GPU init_weights(); //init_test_weights(); @@ -211,6 +244,12 @@ class Conv // Move Initialized Weights to GPU checkCudaErrors(cudaMemcpyAsync(param_kernel, &cpu_param_kernel[0], sizeof(float) * cpu_param_kernel.size(), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpyAsync(param_bias, &cpu_param_bias[0], sizeof(float) * cpu_param_bias.size(), cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemcpyAsync(v_kernel, &cpu_v_kernel[0], sizeof(float) * cpu_v_kernel.size(), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyAsync(s_kernel, &cpu_s_kernel[0], sizeof(float) * cpu_s_kernel.size(), cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemcpyAsync(v_bias, &cpu_v_bias[0], sizeof(float) * cpu_v_bias.size(), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyAsync(s_bias, &cpu_s_bias[0], sizeof(float) * cpu_s_bias.size(), cudaMemcpyHostToDevice)); } // Destructor for de-allocating memory From 86c1497abe8a02ee86bbc4757f75c574896c61c4 Mon Sep 17 00:00:00 2001 From: Arib Alam Date: Sun, 24 May 2020 06:57:36 +0530 Subject: [PATCH 4/4] added adam optimization --- include/convolution.h | 71 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/include/convolution.h b/include/convolution.h index 32d794d..3cc3992 100644 --- a/include/convolution.h +++ b/include/convolution.h @@ -1,3 +1,26 @@ +__global__ void square(float* out, float* in, int sz) +{ + + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id >= sz) + return; + + out[id] = in[id] * in[id]; + +} + +__global__ void computeMoment(float* out, float* v, float* s, int sz) +{ + + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id >= sz) + return; + + out[id] = v[id] / (sqrtf(s[id]) + 10e-8); + +} + + class Conv { /*Convolution layer class*/ @@ -40,6 +63,10 @@ class Conv float *grad_bias; float *grad_data; // gradient with respect input of convolution, Note : INPUT + // square of grads for Adam's Optimization + float* grad_kernel_sq; + float* grad_bias_sq; + // Momentum variables on GPU float* v_kernel; float* s_kernel; @@ -47,6 +74,9 @@ class Conv float* v_bias; float* s_bias; + // float* mom_kernel; + // float* mom_bias; + /*** These variables are on CPU ***/ std::vector cpu_param_kernel; std::vector cpu_param_bias; @@ -213,6 +243,8 @@ class Conv checkCudaErrors(cudaMalloc(¶m_bias, sizeof(float) * out_channels)); checkCudaErrors(cudaMalloc(&grad_kernel, sizeof(float) * in_channels * kernel_size * kernel_size * out_channels)); checkCudaErrors(cudaMalloc(&grad_bias, sizeof(float) * out_channels)); + checkCudaErrors(cudaMalloc(&grad_kernel_sq, sizeof(float) * in_channels * kernel_size * kernel_size * out_channels)); + checkCudaErrors(cudaMalloc(&grad_bias_sq, sizeof(float) * out_channels)); // Gradient with respect to output has same shape as output checkCudaErrors(cudaMalloc(&grad_data, sizeof(float) * batch_size * out_height * out_width * out_channels)); @@ -223,6 +255,9 @@ class Conv checkCudaErrors(cudaMalloc(&v_bias, sizeof(float) * out_channels)); checkCudaErrors(cudaMalloc(&s_bias, sizeof(float) * out_channels)); + // checkCudaErrors(cudaMalloc(&mom_kernel, sizeof(float) * in_channels * kernel_size * kernel_size * out_channels)); + // checkCudaErrors(cudaMalloc(&mom_bias, sizeof(float) * out_channels)); + input_size = batch_size * height * width * in_channels; output_size = batch_size * out_height * out_width * out_channels; @@ -355,11 +390,45 @@ class Conv */ checkCUDNN(cudnnConvolutionBackwardBias(cudnn, &alpha, output_descriptor, 
 						data_grad_above, &beta, bias_descriptor, grad_bias)); // correct!
-		if(falgo)
+
+		// update the momentum variables
+
+		// update velocity
+		checkCudaErrors(cublasSscal(cublas, out_channels, &beta1, v_bias, 1));
+		checkCudaErrors(cublasSaxpy(cublas, out_channels, &one_minus_beta1, grad_bias, 1, v_bias, 1));
+
+		square<<>>(grad_bias_sq, grad_bias, out_channels);
+
+		// update mean square
+		checkCudaErrors(cublasSscal(cublas, out_channels, &beta2, s_bias, 1));
+		checkCudaErrors(cublasSaxpy(cublas, out_channels, &one_minus_beta2, grad_bias_sq, 1, s_bias, 1));
+
+		// update momentum
+		computeMoment<<>>(grad_bias, v_bias, s_bias, out_channels);
+
+		if(falgo) {
 			checkCUDNN(cudnnConvolutionBackwardFilter(cudnn, &alpha, tensor_below, data_below, output_descriptor,
 				data_grad_above, convolution_descriptor, convbwfalgo, d_workspace, m_workspaceSize, &beta,
 				kernel_descriptor, grad_kernel)); // check the workspace handling; the rest is correct!
+
+			// update the momentum variables
+			int ks = in_channels * kernel_size * kernel_size * out_channels;
+
+			// update velocity
+			checkCudaErrors(cublasSscal(cublas, ks, &beta1, v_kernel, 1));
+			checkCudaErrors(cublasSaxpy(cublas, ks, &one_minus_beta1, grad_kernel, 1, v_kernel, 1));
+
+			square<<>>(grad_kernel_sq, grad_kernel, ks);
+
+			// update mean square
+			checkCudaErrors(cublasSscal(cublas, ks, &beta2, s_kernel, 1));
+			checkCudaErrors(cublasSaxpy(cublas, ks, &one_minus_beta2, grad_kernel_sq, 1, s_kernel, 1));
+
+			// update momentum
+			computeMoment<<>>(grad_kernel, v_kernel, s_kernel, ks);
+
+		}
 		if(dalgo)
 			checkCUDNN(cudnnConvolutionBackwardData(cudnn, &alpha, kernel_descriptor,
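Taken together, the cublasSscal/cublasSaxpy calls and the square / computeMoment kernels above maintain the two Adam moment estimates in the v_* and s_* buffers allocated in the previous patch, and leave v / (sqrt(s) + eps) in grad_kernel / grad_bias for updateWeights() to scale by the learning rate. A minimal fused sketch of the same Adam-style step is below; the names adamStep and BLOCK_SIZE are assumptions, and, like the patch, it omits the bias-correction terms of the original Adam paper.

// Minimal illustrative sketch (not part of the patch): the same Adam-style step
// as a single fused CUDA kernel. The names adamStep and BLOCK_SIZE are assumptions.
#define BLOCK_SIZE 256

__global__ void adamStep(float* param, float* v, float* s, const float* grad,
                         float lr, float beta1, float beta2, float eps, int sz)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= sz)
        return;

    // First moment (velocity) and second moment (mean squared gradient),
    // mirroring the cublasSscal/cublasSaxpy + square calls in the patch.
    v[id] = beta1 * v[id] + (1.0f - beta1) * grad[id];
    s[id] = beta2 * s[id] + (1.0f - beta2) * grad[id] * grad[id];

    // The patch stops at v / (sqrt(s) + eps) (computeMoment) and leaves the
    // learning-rate scaling to updateWeights(); here both are applied at once.
    // Bias correction (dividing by 1 - beta^t) is omitted, as in the patch.
    param[id] -= lr * v[id] / (sqrtf(s[id]) + eps);
}

// Example launch for one conv layer's kernel weights:
//   int ks = in_channels * kernel_size * kernel_size * out_channels;
//   int blocks = (ks + BLOCK_SIZE - 1) / BLOCK_SIZE;
//   adamStep<<<blocks, BLOCK_SIZE>>>(param_kernel, v_kernel, s_kernel, grad_kernel,
//                                    learning_rate, 0.9f, 0.999f, 1e-8f, ks);

One detail worth noting: computeMoment writes the epsilon as 10e-8, which evaluates to 1e-7; the constant usually quoted for Adam is 1e-8.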