From a929ca2058aa3e17ef9effa4260304e6dcda6a19 Mon Sep 17 00:00:00 2001
From: akrizhevsky
Date: Tue, 17 Jul 2012 22:40:00 +0000
Subject: [PATCH] Fixed bug in contrast normalization backpropagation code which caused wrong gradients to be computed near image borders.

---
 data.py                         | 17 ++++++++++-------
 gpumodel.py                     | 14 +++++++++-----
 include/cudaconv2/conv_util.cuh | 22 ++++++++++------------
 src/cudaconv2/conv_util.cu      | 18 +++++++++++++-----
 src/layer.cu                    |  4 ++--
 src/nvmatrix/nvmatrix.cu        |  3 ++-
 6 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/data.py b/data.py
index b01f8af..cdfd661 100644
--- a/data.py
+++ b/data.py
@@ -45,7 +45,7 @@ def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None,
         self.batch_meta = self.get_batch_meta(data_dir)
         self.data_dic = None
         self.test = test
-        self.batch_range_idx = batch_range.index(init_batchnum)
+        self.batch_idx = batch_range.index(init_batchnum)
 
     def get_next_batch(self):
         if self.data_dic is None or len(self.batch_range) > 1:
@@ -84,13 +84,13 @@ def get_data_dims(self):
         return self.batch_meta['num_vis']
 
     def advance_batch(self):
-        self.batch_range_idx = self.get_next_batch_idx()
-        self.curr_batchnum = self.batch_range[self.batch_range_idx]
-        if self.batch_range_idx == 0: # we wrapped
+        self.batch_idx = self.get_next_batch_idx()
+        self.curr_batchnum = self.batch_range[self.batch_idx]
+        if self.batch_idx == 0: # we wrapped
             self.curr_epoch += 1
 
     def get_next_batch_idx(self):
-        return (self.batch_range_idx + 1) % len(self.batch_range)
+        return (self.batch_idx + 1) % len(self.batch_range)
 
     def get_next_batch_num(self):
         return self.batch_range[self.get_next_batch_idx()]
@@ -150,7 +150,7 @@ def __init__(self, data_dim):
         self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True}
         self.curr_epoch = 1
         self.curr_batchnum = 1
-        self.batch_range_idx = 0
+        self.batch_idx = 0
 
     def get_next_batch(self):
         epoch, batchnum = self.curr_epoch, self.curr_batchnum
@@ -170,7 +170,7 @@ def __init__(self, data_dim, num_classes=10, num_cases=512):
         self.num_classes = num_classes
         self.curr_epoch = 1
         self.curr_batchnum = 1
-        self.batch_range_idx=0
+        self.batch_idx=0
 
     def get_num_classes(self):
         return self.num_classes
@@ -197,6 +197,9 @@ def get_next_batch(self):
         return epoch, batchnum, self.data_dic[batchnum - self.batch_range[0]]
 
 class LabeledDataProvider(DataProvider):
+    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
+        DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
+
     def get_num_classes(self):
         return len(self.batch_meta['label_names'])
 
diff --git a/gpumodel.py b/gpumodel.py
index dfc5b34..711bda8 100644
--- a/gpumodel.py
+++ b/gpumodel.py
@@ -225,12 +225,16 @@ def get_test_error(self):
         while True:
             data = next_data
             self.start_batch(data, train=False)
-            if not self.test_one and data[1] < self.test_batch_range[-1]: # load next batch
+            load_next = not self.test_one and data[1] < self.test_batch_range[-1]
+            if load_next: # load next batch
                 next_data = self.get_next_batch(train=False)
-                test_outputs += [self.finish_batch()]
-            else:
-                test_outputs += [self.finish_batch()]
-                break
+            test_outputs += [self.finish_batch()]
+            if self.test_only: # Print the individual batch results for safety
+                print "batch %d: %s" % (data[1], str(test_outputs[-1]))
+            if not load_next:
+                break
+            sys.stdout.flush()
+
         return self.aggregate_test_outputs(test_outputs)
 
     def set_var(self, var_name, var_val):
diff --git a/include/cudaconv2/conv_util.cuh b/include/cudaconv2/conv_util.cuh
index 4eb00c1..040387c 100644
--- a/include/cudaconv2/conv_util.cuh
+++ b/include/cudaconv2/conv_util.cuh
@@ -69,19 +69,15 @@ void convResponseNormCrossMap(NVMatrix& images, NVMatrix& denoms, NVMatrix& targ
                               float powScale, bool blocked);
 
 class AvgPooler {
-private:
-    float _num;
 public:
-    AvgPooler(float num) : _num(num) {
-    }
     __device__ inline float operator()(const float a, const float b) const {
         return a + b;
     }
     __device__ inline float getBaseValue() const {
         return 0;
     }
-    __device__ inline float output(const float a) const {
-        return a / _num;
+    __device__ inline float output(const float a, const int regionSize) const {
+        return a / regionSize;
     }
 };
 
@@ -93,7 +89,7 @@ public:
     __device__ inline float getBaseValue() const {
         return -2e38;
     }
-    __device__ inline float output(const float a) const {
+    __device__ inline float output(const float a, const int regionSize) const {
         return a;
     }
 };
@@ -106,7 +102,7 @@ public:
     __device__ inline float getBaseValue() const {
         return 0.0f;
     }
-    __device__ inline float output(const float a) const {
+    __device__ inline float output(const float a, const int regionSize) const {
         return a;
     }
 };
@@ -166,6 +162,7 @@ __global__ void kLocalPool(float* imgs, float* target, const int imgSize, const
     const int loopStartX = MAX(0, startImgPxX);
     const int loopEndY = MIN(imgSize, startImgPxY + subsX);
     const int loopEndX = MIN(imgSize, startImgPxX + subsX);
+    const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
    for (int y = loopStartY; y < loopEndY; y++) {
         for (int x = loopStartX; x < loopEndX; x++) {
             const int imgPx = y * imgSize + x;
@@ -186,7 +183,7 @@ __global__ void kLocalPool(float* imgs, float* target, const int imgSize, const
         if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
             #pragma unroll
             for (int f = 0; f < filtersPerThread; f++) {
-                target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i]);
+                target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
             }
         }
     }
@@ -259,7 +256,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
     const int loopStartX = MAX(startImgPxX, 0);
     const int loopEndY = MIN(imgSize, endImgPxY + 3);
     const int loopEndX = MIN(imgSize, endImgPxX + 3);
-    
+
     const int imgIdx = blockImgIdx + threadIdx.x;
 
     imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
@@ -273,7 +270,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
             prod[f][i] = agg.getBaseValue();
         }
     }
-    
+    int regionSize = 0;
     for (int y = loopStartY; y < loopEndY; y++) {
         const bool isInY = y >= myStartImgPxY && y < myEndImgPxY ;
         for (int x = loopStartX; x < loopEndX; x++) {
@@ -303,6 +300,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
                     }
                 }
             }
+            ++regionSize;
         }
 
         __syncthreads();
@@ -314,7 +312,7 @@ __global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i]);
+                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
                 }
             }
         }
diff --git a/src/cudaconv2/conv_util.cu b/src/cudaconv2/conv_util.cu
index 1bebdad..ea9dc91 100644
--- a/src/cudaconv2/conv_util.cu
+++ b/src/cudaconv2/conv_util.cu
@@ -1423,18 +1423,27 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
         }
     }
 
-    if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX 
+    if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
         && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
         for (int my = startOutputY; my < endOutputY; my++) {
+            const float regionStartY = fmaxf(0, startX + my * strideX);
+            const float regionEndY = fminf(imgSize, startX + my * strideX + subsX);
+            const float regionSizeY = regionEndY - regionStartY;
             for (int mx = startOutputX; mx < endOutputX; mx++) {
                 const int outputIdx = my * outputsX + mx;
+                const float regionStartX = fmaxf(0, startX + mx * strideX);
+                const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX);
+                const float regionSizeX = regionEndX - regionStartX;
+                // It's important to do the division here, because pushing division into the below
+                // loops makes the code 4x slower.
+                const float regionSizeInv = 1.0f / (regionSizeX * regionSizeY);
                 #pragma unroll
                 for (int i = 0; i < imgsPerThread; i++) {
                     if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                         #pragma unroll
                         for (int f = 0; f < filtersPerThread; f++) {
-                            prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+                            prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X] * regionSizeInv;
                         }
                     }
                 }
@@ -1448,7 +1457,7 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i] / (subsX * subsX);
+                    target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
                 }
             }
         }
@@ -1458,7 +1467,7 @@ __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize,
             if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                 #pragma unroll
                 for (int f = 0; f < filtersPerThread; f++) {
-                    target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i] / (subsX * subsX);
+                    target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
                 }
             }
         }
@@ -2335,7 +2344,6 @@ void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs
                 }
             }
         }
-        
     } else {
        int imgsPerThread = numImages % 64 == 0 ? 2 : 1;
        bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
diff --git a/src/layer.cu b/src/layer.cu
index 6e29862..fef642f 100644
--- a/src/layer.cu
+++ b/src/layer.cu
@@ -712,7 +712,7 @@ AvgPoolLayer::AvgPoolLayer(ConvNet* convNet, PyObject* paramsDict) : PoolLayer(c
 }
 
 void AvgPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType) {
-    convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler(_sizeX*_sizeX));
+    convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler());
 }
 
 void AvgPoolLayer::bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType) {
@@ -896,7 +896,7 @@ ContrastNormLayer::ContrastNormLayer(ConvNet* convNet, PyObject* paramsDict) : R
 
 void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType) {
     NVMatrix& images = *_inputs[0];
-    convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler(_size*_size));
+    convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler());
     _meanDiffs.add(images, -1, 1);
     convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow);
 }
diff --git a/src/nvmatrix/nvmatrix.cu b/src/nvmatrix/nvmatrix.cu
index a744931..18b4dbc 100644
--- a/src/nvmatrix/nvmatrix.cu
+++ b/src/nvmatrix/nvmatrix.cu
@@ -275,7 +275,6 @@ void NVMatrix::initRandom(unsigned long long seed) {
     rndDevStates[d] = NULL;
     CUDA_CALL(cudaMalloc((void **)&rndDevStates[d], NUM_RND_STREAMS * sizeof(curandState)));
     pthread_mutex_unlock(_rndMutex);
-    printf("initialized random for %d\n", d);
     kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
     cutilCheckMsg("initRandom: Kernel execution failed");
 }
@@ -518,6 +517,8 @@ NVMatrix& NVMatrix::reshaped(int numRows, int numCols) {
 void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
                     int srcStartCol, int srcEndCol,
                     int destStartRow, int destStartCol) const {
+    srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow;
+    srcEndCol = srcEndCol < 0 ? _numCols : srcEndCol;
     NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol);
     NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol);
     srcSlice->apply(NVMatrixOps::Identity(), *destSlice);
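
Editor's note, not part of the patch above: a minimal host-side C++ sketch of the border logic the patched kernels now share. Near image borders the pooling window is clipped to the image, so the forward average divides by the clipped area and the backward pass scales each output's gradient by the inverse of that same area rather than by a uniform 1/(subsX*subsX). The function name clippedRegionSize and the example sizes are illustrative assumptions only.

// Illustrative only -- mirrors the loopStart/loopEnd clipping used by
// kLocalPool and kLocalAvgUndo after this patch.
#include <algorithm>
#include <cstdio>

// Number of image pixels actually covered by the pooling window of output
// (outY, outX), after clipping the window to the imgSize x imgSize image.
static int clippedRegionSize(int outY, int outX, int start, int stride,
                             int subs, int imgSize) {
    int startY = std::max(0, start + outY * stride);
    int endY   = std::min(imgSize, start + outY * stride + subs);
    int startX = std::max(0, start + outX * stride);
    int endX   = std::min(imgSize, start + outX * stride + subs);
    return (endY - startY) * (endX - startX);
}

int main() {
    // Contrast-norm style pooling as in ContrastNormLayer::fpropActs:
    // window size 5, start = -size/2 = -2, stride 1, on a hypothetical 8x8 image.
    const int subs = 5, start = -subs / 2, stride = 1, imgSize = 8;
    printf("old divisor everywhere: %d\n", subs * subs);                                            // 25
    printf("new divisor at (0,0):   %d\n", clippedRegionSize(0, 0, start, stride, subs, imgSize));  // 9
    printf("new divisor at (4,4):   %d\n", clippedRegionSize(4, 4, start, stride, subs, imgSize));  // 25
    // kLocalAvgUndo now multiplies each output's gradient by
    // 1.0f / (regionSizeX * regionSizeY) instead of dividing the accumulated
    // sum by subsX * subsX, which is what fixes the gradients near borders.
    return 0;
}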