From a929ca2058aa3e17ef9effa4260304e6dcda6a19 Mon Sep 17 00:00:00 2001
From: akrizhevsky
Date: Tue, 17 Jul 2012 22:40:00 +0000
Subject: [PATCH] Fixed bug in contrast normalization backpropagation code which caused wrong gradients to be computed near image borders.

---
 data.py                         | 17 ++++++++++-------
 gpumodel.py                     | 14 +++++++++-----
 include/cudaconv2/conv_util.cuh | 22 ++++++++++------------
 src/cudaconv2/conv_util.cu      | 18 +++++++++++++-----
 src/layer.cu                    |  4 ++--
 src/nvmatrix/nvmatrix.cu        |  3 ++-
 6 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/data.py b/data.py
index b01f8af..cdfd661 100644
--- a/data.py
+++ b/data.py
@@ -45,7 +45,7 @@ def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None,
         self.batch_meta = self.get_batch_meta(data_dir)
         self.data_dic = None
         self.test = test
-        self.batch_range_idx = batch_range.index(init_batchnum)
+        self.batch_idx = batch_range.index(init_batchnum)
 
     def get_next_batch(self):
         if self.data_dic is None or len(self.batch_range) > 1:
@@ -84,13 +84,13 @@ def get_data_dims(self):
         return self.batch_meta['num_vis']
 
     def advance_batch(self):
-        self.batch_range_idx = self.get_next_batch_idx()
-        self.curr_batchnum = self.batch_range[self.batch_range_idx]
-        if self.batch_range_idx == 0: # we wrapped
+        self.batch_idx = self.get_next_batch_idx()
+        self.curr_batchnum = self.batch_range[self.batch_idx]
+        if self.batch_idx == 0: # we wrapped
             self.curr_epoch += 1
 
     def get_next_batch_idx(self):
-        return (self.batch_range_idx + 1) % len(self.batch_range)
+        return (self.batch_idx + 1) % len(self.batch_range)
 
     def get_next_batch_num(self):
         return self.batch_range[self.get_next_batch_idx()]
@@ -150,7 +150,7 @@ def __init__(self, data_dim):
         self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True}
         self.curr_epoch = 1
         self.curr_batchnum = 1
-        self.batch_range_idx = 0
+        self.batch_idx = 0
 
     def get_next_batch(self):
         epoch, batchnum = self.curr_epoch, self.curr_batchnum
@@ -170,7 +170,7 @@ def __init__(self, data_dim, num_classes=10, num_cases=512):
         self.num_classes = num_classes
         self.curr_epoch = 1
         self.curr_batchnum = 1
-        self.batch_range_idx=0
+        self.batch_idx=0
 
     def get_num_classes(self):
         return self.num_classes
@@ -197,6 +197,9 @@ def get_next_batch(self):
         return epoch, batchnum, self.data_dic[batchnum - self.batch_range[0]]
 
 class LabeledDataProvider(DataProvider):
+    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
+        DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
+
     def get_num_classes(self):
         return len(self.batch_meta['label_names'])
 
diff --git a/gpumodel.py b/gpumodel.py
index dfc5b34..711bda8 100644
--- a/gpumodel.py
+++ b/gpumodel.py
@@ -225,12 +225,16 @@ def get_test_error(self):
         while True:
             data = next_data
             self.start_batch(data, train=False)
-            if not self.test_one and data[1] < self.test_batch_range[-1]: # load next batch
+            load_next = not self.test_one and data[1] < self.test_batch_range[-1]
+            if load_next: # load next batch
                 next_data = self.get_next_batch(train=False)
-                test_outputs += [self.finish_batch()]
-            else:
-                test_outputs += [self.finish_batch()]
-                break
+            test_outputs += [self.finish_batch()]
+            if self.test_only: # Print the individual batch results for safety
+                print "batch %d: %s" % (data[1], str(test_outputs[-1]))
+            if not load_next:
+                break
+            sys.stdout.flush()
+
         return self.aggregate_test_outputs(test_outputs)
 
     def set_var(self, var_name, var_val):
diff --git a/include/cudaconv2/conv_util.cuh b/include/cudaconv2/conv_util.cuh
index 4eb00c1..040387c 100644
--- a/include/cudaconv2/conv_util.cuh
+++ b/include/cudaconv2/conv_util.cuh
@@ -69,19 +69,15 @@ void convResponseNormCrossMap(NVMatrix& images, NVMatrix& denoms, NVMatrix& targ
                               float powScale, bool blocked);
 
 class AvgPooler {
-private:
-    float _num;
 public:
-    AvgPooler(float num) : _num(num) {
-    }
     __device__ inline float operator()(const float a, const float b) const {
         return a + b;
     }
     __device__ inline float getBaseValue() const {
         return 0;
     }
-    __device__ inline float output(const float a) const {
-        return a / _num;
+    __device__ inline float output(const float a, const int regionSize) const {
+        return a / regionSize;
     }
 };
 
@@ -93,7 +89,7 @@ public:
     __device__ inline float getBaseValue() const {
         return -2e38;
     }
-    __device__ inline float output(const float a) const {
+    __device__ inline float output(const float a, const int regionSize) const {
         return a;
     }
 };
@@ -106,7 +102,7 @@ public:
     __device__ inline float getBaseValue() const {
         return 0.0f;
     }
-    __device__ inline float output(const float a) const {
+    __device__ inline float output(const float a, const int regionSize) const {
         return a;
     }
 };
@@ -166,6 +162,7 @@ __global__ void kLocalPool(float* imgs, float* target, const int imgSize, const
     const int loopStartX = MAX(0, startImgPxX);
     const int loopEndY = MIN(imgSize, startImgPxY + subsX);
     const int loopEndX = MIN(imgSize, startImgPxX + subsX);
+    const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
    for (int y = loopStartY; y < loopEndY; y++) {
         for (int x = loopStartX; x < loopEndX; x++) {
             const int imgPx = y * imgSize + x;
@@ -186,7 +183,7 @@ __global__ void kLocalPool(float* imgs, float* target, const int imgSize, const
         if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
             #pragma unroll
             for (int f = 0; f < filtersPerThread; f++) {
-                target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i]);
+                target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
             }
         }
     }
@@ -259,7 +256,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
     const int loopStartX = MAX(startImgPxX, 0);
     const int loopEndY = MIN(imgSize, endImgPxY + 3);
     const int loopEndX = MIN(imgSize, endImgPxX + 3);
-    
+
     const int imgIdx = blockImgIdx + threadIdx.x;
 
     imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
@@ -273,7 +270,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
             prod[f][i] = agg.getBaseValue();
         }
     }
-    
+    int regionSize = 0;
     for (int y = loopStartY; y < loopEndY; y++) {
         const bool isInY = y >= myStartImgPxY && y < myEndImgPxY ;
         for (int x = loopStartX; x < loopEndX; x++) {
@@ -303,6 +300,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
                     }
                 }
             }
+            ++regionSize;
         }
 
         __syncthreads();
@@ -314,7 +312,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i]);
+                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
                 }
             }
         }
diff --git a/src/cudaconv2/conv_util.cu b/src/cudaconv2/conv_util.cu
index 1bebdad..ea9dc91 100644
--- a/src/cudaconv2/conv_util.cu
+++ b/src/cudaconv2/conv_util.cu
@@ -1423,18 +1423,27 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
         }
     }
 
-    if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX 
+    if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
         && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
         for (int my = startOutputY; my < endOutputY; my++) {
+            const float regionStartY = fmaxf(0, startX + my * strideX);
+            const float regionEndY = fminf(imgSize, startX + my * strideX + subsX);
+            const float regionSizeY = regionEndY - regionStartY;
             for (int mx = startOutputX; mx < endOutputX; mx++) {
                 const int outputIdx = my * outputsX + mx;
+                const float regionStartX = fmaxf(0, startX + mx * strideX);
+                const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX);
+                const float regionSizeX = regionEndX - regionStartX;
+                // It's important to do the division here, because pushing division into the below
+                // loops makes the code 4x slower.
+                const float regionSizeInv = 1.0f / (regionSizeX * regionSizeY);
                 #pragma unroll
                 for (int i = 0; i < imgsPerThread; i++) {
                     if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                         #pragma unroll
                         for (int f = 0; f < filtersPerThread; f++) {
-                            prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+                            prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X] * regionSizeInv;
                         }
                     }
                 }
@@ -1448,7 +1457,7 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i] / (subsX * subsX);
+                    target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
                 }
             }
         }
@@ -1458,7 +1467,7 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i] / (subsX * subsX);
+                    target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
                 }
             }
         }
@@ -2335,7 +2344,6 @@ void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs
                 }
             }
         }
-        
     } else {
        int imgsPerThread = numImages % 64 == 0 ? 2 : 1;
        bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
diff --git a/src/layer.cu b/src/layer.cu
index 6e29862..fef642f 100644
--- a/src/layer.cu
+++ b/src/layer.cu
@@ -712,7 +712,7 @@ AvgPoolLayer::AvgPoolLayer(ConvNet* convNet, PyObject* paramsDict) : PoolLayer(c
 }
 
 void AvgPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType) {
-    convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler(_sizeX*_sizeX));
+    convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler());
 }
 
 void AvgPoolLayer::bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType) {
@@ -896,7 +896,7 @@ ContrastNormLayer::ContrastNormLayer(ConvNet* convNet, PyObject* paramsDict) : R
 
 void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType) {
     NVMatrix& images = *_inputs[0];
-    convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler(_size*_size));
+    convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler());
     _meanDiffs.add(images, -1, 1);
     convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow);
 }
diff --git a/src/nvmatrix/nvmatrix.cu b/src/nvmatrix/nvmatrix.cu
index a744931..18b4dbc 100644
--- a/src/nvmatrix/nvmatrix.cu
+++ b/src/nvmatrix/nvmatrix.cu
@@ -275,7 +275,6 @@ void NVMatrix::initRandom(unsigned long long seed) {
     rndDevStates[d] = NULL;
     CUDA_CALL(cudaMalloc((void **)&rndDevStates[d], NUM_RND_STREAMS * sizeof(curandState)));
     pthread_mutex_unlock(_rndMutex);
-    printf("initialized random for %d\n", d);
     kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
     cutilCheckMsg("initRandom: Kernel execution failed");
 }
@@ -518,6 +517,8 @@ NVMatrix& NVMatrix::reshaped(int numRows, int numCols) {
 void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
                     int srcStartCol, int srcEndCol,
                     int destStartRow, int destStartCol) const {
+    srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow;
+    srcEndCol = srcEndCol < 0 ? _numCols : srcEndCol;
     NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol);
     NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol);
     srcSlice->apply(NVMatrixOps::Identity(), *destSlice);
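
Editor's note, not part of the patch above: a minimal host-side C++ sketch of the border logic the patched kernels now share. Near image borders the pooling window is clipped to the image, so the forward average divides by the clipped area and the backward pass scales each output's gradient by the inverse of that same area rather than by a uniform 1/(subsX*subsX). The function name clippedRegionSize and the example sizes are illustrative assumptions only.

// Illustrative only -- mirrors the loopStart/loopEnd clipping used by
// kLocalPool and kLocalAvgUndo after this patch.
#include <algorithm>
#include <cstdio>

// Number of image pixels actually covered by the pooling window of output
// (outY, outX), after clipping the window to the imgSize x imgSize image.
static int clippedRegionSize(int outY, int outX, int start, int stride,
                             int subs, int imgSize) {
    int startY = std::max(0, start + outY * stride);
    int endY   = std::min(imgSize, start + outY * stride + subs);
    int startX = std::max(0, start + outX * stride);
    int endX   = std::min(imgSize, start + outX * stride + subs);
    return (endY - startY) * (endX - startX);
}

int main() {
    // Contrast-norm style pooling as in ContrastNormLayer::fpropActs:
    // window size 5, start = -size/2 = -2, stride 1, on a hypothetical 8x8 image.
    const int subs = 5, start = -subs / 2, stride = 1, imgSize = 8;
    printf("old divisor everywhere: %d\n", subs * subs);                                            // 25
    printf("new divisor at (0,0):   %d\n", clippedRegionSize(0, 0, start, stride, subs, imgSize));  // 9
    printf("new divisor at (4,4):   %d\n", clippedRegionSize(4, 4, start, stride, subs, imgSize));  // 25
    // kLocalAvgUndo now multiplies each output's gradient by
    // 1.0f / (regionSizeX * regionSizeY) instead of dividing the accumulated
    // sum by subsX * subsX, which is what fixes the gradients near borders.
    return 0;
}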