diff --git a/README.md b/README.md
index 5524be9..c1e8c57 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,8 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
 
 | Framework | Time [1](#foottime)|
 |:---|:---|
-| Keras (Theano backend) BN mode 0 [3](#footBNmode)| 347.546 ms|
-| Keras (Theano backend) BN mode 2 | 269.074 ms|
+| Keras (Theano backend) + BN| 347.546 ms|
 | Keras (TensorFlow backend) mode 0 | 560.938 ms|
-| Keras (TensorFlow backend) mode 2 | 504.966 ms|
 |Tensorflow NHWC + BN no XLA| 493.235 ms|
 |Tensorflow NHWC + BN + XLA| 341.702 ms|
 |Tensorflow NHWC + fused BN no XLA| 395.963 ms|
@@ -38,8 +36,6 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
 
 2: Note that at the moment, keras uses traditional NHWC tensorflow ordering
 
-3: Mode 0 = use per-batch statistics to normalize the data, and during testing use running averages computed during the training phase. Mode 2 = use per-batch statistics to normalize the data during training and testing.
-
 ### System specs
 
 - Ubuntu 14.04
@@ -47,7 +43,7 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
 - cuDNN 5.1.10
 - theano '0.9.0beta1.dev-173eef98360c23d7418bad3a36f5fb938724f05f' (cuda backend)
 - tensorflow 1.0.0 (compiled from source with CUDA 8.0 cuDNN 5.1.10 and XLA JIT)
-- Keras 1.2.2
+- Keras 2.0.1
 
 ## Usage
 
diff --git a/benchmark_keras.py b/benchmark_keras.py
index bbbd84e..26c6dc2 100644
--- a/benchmark_keras.py
+++ b/benchmark_keras.py
@@ -4,27 +4,30 @@
 import keras.backend as K
 from keras.models import Model
 from keras.layers import Flatten, Dense, Input
-from keras.layers import Convolution2D, MaxPooling2D
+from keras.layers.convolutional import Conv2D
+from keras.layers.pooling import MaxPooling2D
 from keras.optimizers import SGD
 from keras.layers import Activation
 from keras.layers.normalization import BatchNormalization
+from keras.datasets import cifar10
 import utils
+from keras.utils import np_utils
 
 
-def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, bn_mode):
+def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, data_format):
 
     for i in range(nb_conv):
-        x = Convolution2D(nb_filters, 3, 3, border_mode='same')(x)
+        x = Conv2D(filters=nb_filters, kernel_size=(3, 3), padding='same', data_format=data_format)(x)
         if use_bn:
-            x = BatchNormalization(mode=bn_mode, axis=bn_axis)(x)
+            x = BatchNormalization(scale=False, axis=bn_axis)(x)
         x = Activation("relu")(x)
-    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
+    x = MaxPooling2D((2, 2), strides=(2, 2), padding='valid', data_format=data_format)(x)
 
     return x
 
 
-def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
+def run_VGG16(batch_size=16, n_trials=100, use_bn=False, data_format="NCHW"):
     """Run VGG16 experiment
 
     Args:
@@ -33,33 +36,39 @@
     """
 
     # Determine proper input shape
-    if K.image_dim_ordering() == 'th':
+    if data_format == "NCHW":
+        assert K.image_data_format() == 'channels_first', "Change your keras.json file"
+        # Update NCHW to channels_first (keras conventions)
+        data_format = "channels_first"
         input_shape = (3, 224, 224)
         bn_axis = 1
     else:
+        assert K.image_data_format() == 'channels_last', "Change your keras.json file"
+        data_format = "channels_last"
         input_shape = (224, 224, 3)
         bn_axis = -1
 
     img_input = Input(shape=input_shape)
     # Block 1
-    x = vgg_block(img_input, 64, use_bn, 2, bn_axis, bn_mode)
+    x = vgg_block(img_input, 64, use_bn, 2, bn_axis, data_format)
     # Block 2
-    x = vgg_block(x, 128, use_bn, 2, bn_axis, bn_mode)
+    x = vgg_block(x, 128, use_bn, 2, bn_axis, data_format)
     # Block 3
-    x = vgg_block(x, 256, use_bn, 3, bn_axis, bn_mode)
+    x = vgg_block(x, 256, use_bn, 3, bn_axis, data_format)
     # Block 4
-    x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+    x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
     # Block 5
-    x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+    x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
 
     # Classification block
     x = Flatten(name='flatten')(x)
-    x = Dense(4096, activation='relu', name='fc1')(x)
-    x = Dense(4096, activation='relu', name='fc2')(x)
-    x = Dense(1000, activation='softmax', name='predictions')(x)
+    x = Dense(units=4096, activation='relu', name='fc1')(x)
+    x = Dense(units=4096, activation='relu', name='fc2')(x)
+    x = Dense(units=1000, activation='softmax', name='predictions')(x)
 
     # Create model
-    model = Model(img_input, x)
+    model = Model(inputs=img_input, outputs=x)
+    model.summary()
 
     opt = SGD()
     model.compile(loss='categorical_crossentropy', optimizer=opt)
@@ -87,3 +96,67 @@
     utils.print_module("Keras backend: %s" % K.backend())
     utils.print_module("Backend version: %s" % backend.__version__)
     utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
+
+
+def run_SimpleCNN(batch_size):
+
+    # Determine proper input shape
+    if K.image_data_format() == 'channels_first':
+        input_shape = (3, 32, 32)
+    else:
+        input_shape = (32, 32, 3)
+
+    img_input = Input(shape=input_shape)
+    # Block 1
+    x = Conv2D(32, (3, 3), padding="same", activation="relu")(img_input)
+    x = Conv2D(32, (3, 3), padding="same", activation="relu")(x)
+    x = MaxPooling2D()(x)
+    # Block 2
+    x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+    x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+    x = MaxPooling2D()(x)
+    # Dense part
+    x = Flatten(name='flatten')(x)
+    x = Dense(512, activation="relu")(x)
+    x = Dense(10, activation="softmax")(x)
+
+    # Create model
+    model = Model(inputs=img_input, outputs=x)
+
+    opt = SGD()
+    model.compile(loss='categorical_crossentropy', optimizer=opt)
+
+    # Input data
+    data = cifar10.load_data()
+    X, Y = data[0][0], data[0][1]
+    Y = np_utils.to_categorical(Y, num_classes=10)
+
+    # warmup
+    model.train_on_batch(X[:32], Y[:32])
+
+    # Split in chunks of size batch size
+    num_elem = X.shape[0]
+    chunk_size = batch_size
+    num_chunks = num_elem / chunk_size
+    list_chunks = np.array_split(np.arange(num_elem), num_chunks)
+
+    for e in range(10):
+        t0 = time()
+        for index, chunk_idx in enumerate(list_chunks):
+            X_batch, Y_batch = X[chunk_idx], Y[chunk_idx]
+            model.train_on_batch(X_batch, Y_batch)
+        t1 = time()
+
+        print t1 - t0
+
+    # Import backend to get version number
+    if K.backend() == "tensorflow":
+        import tensorflow as backend
+    elif K.backend() == "theano":
+        import theano as backend
+
+    # Print summary
+    utils.print_module("Keras version: %s" % keras.__version__)
+    utils.print_module("Keras backend: %s" % K.backend())
+    utils.print_module("Backend version: %s" % backend.__version__)
+    utils.print_result("%7.3f ms. per batch" % (1000. * (t1 - t0) / len(list_chunks)))
diff --git a/benchmark_pytorch.py b/benchmark_pytorch.py
new file mode 100644
index 0000000..df97de7
--- /dev/null
+++ b/benchmark_pytorch.py
@@ -0,0 +1,143 @@
+import time
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torchvision.datasets as dset
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+import torchvision.transforms as transforms
+from torchvision.models import vgg
+import utils
+
+
+class SimpleCNN(nn.Module):
+    def __init__(self):
+        super(SimpleCNN, self).__init__()
+
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=True)
+        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=True)
+        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True)
+        self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=True)
+
+        self.fc1 = nn.Linear(4096, 512)
+        self.fc2 = nn.Linear(512, 10)
+
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                n = m.weight.size(1)
+                m.weight.data.normal_(0, 0.01)
+                m.bias.data.zero_()
+
+    def forward(self, x):
+
+        # First conv block
+        out = F.relu(self.conv1(x))
+        out = F.relu(self.conv2(out))
+        out = F.max_pool2d(out, 2)
+
+        # Second conv block
+        out = F.relu(self.conv3(out))
+        out = F.relu(self.conv4(out))
+        out = F.max_pool2d(out, 2)
+
+        # Flatten
+        out = out.view(out.size(0), -1)
+
+        # Linear
+        out = F.relu(self.fc1(out))
+        out = F.log_softmax(self.fc2(out))
+
+        return out
+
+
+def run_SimpleCNN(batch_size, nb_epoch):
+
+    normMean = [0.49139968, 0.48215827, 0.44653124]
+    normStd = [0.24703233, 0.24348505, 0.26158768]
+    normTransform = transforms.Normalize(normMean, normStd)
+
+    list_transforms = [transforms.ToTensor(), normTransform]
+    trainTransform = transforms.Compose(list_transforms)
+
+    kwargs = {'num_workers': 0, 'pin_memory': True}
+    dataset = dset.CIFAR10(root='cifar', train=True, download=True, transform=trainTransform)
+    trainLoader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)
+
+    net = SimpleCNN()
+
+    net = net.cuda()
+
+    optimizer = optim.SGD(net.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-4)
+
+    for epoch in range(nb_epoch):
+
+        s = time.time()
+
+        for batch_idx, (data, target) in enumerate(trainLoader):
+            data, target = data.cuda(), target.cuda()
+            data, target = Variable(data), Variable(target)
+            optimizer.zero_grad()
+            output = net(data)
+            loss = F.nll_loss(output, target)
+            loss.backward()
+            optimizer.step()
+
+        print time.time() - s
+
+
+def run_VGG16(batch_size, n_trials):
+
+    # Initialize network
+    net = vgg.vgg16()
+    net.cuda()
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss().cuda()
+    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+    # Data (int64 labels in [0, n_classes) so they are valid class indices)
+    n_classes = 1000
+    labels = np.random.randint(0, n_classes, batch_size * n_trials).astype(np.int64).tolist()
+    labels = torch.LongTensor(labels)
+    inputs = torch.randn(batch_size * n_trials, 3, 224, 224)
+
+    dataset = torch.utils.data.TensorDataset(inputs, labels)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)
+
+    t0 = time.time()
+    n = 0
+    for i, (X, y) in enumerate(dataloader):
+
+        ll = Variable(y.cuda(async=True))
+        inp = Variable(X.cuda(async=True))
+
+        # forward pass
+        outputs = net(inp)
+
+        # compute loss
+        loss = criterion(outputs, ll)
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        loss.backward()
+        optimizer.step()
+
+        n += 1
+
+    t1 = time.time()
+
+    # Print summary
+    utils.print_module("pytorch version: %s" % torch.__version__)
+    utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
\ No newline at end of file
diff --git a/benchmark_tensorflow.py b/benchmark_tensorflow.py
index cf081d4..bc4efa5 100644
--- a/benchmark_tensorflow.py
+++ b/benchmark_tensorflow.py
@@ -37,6 +37,7 @@ def linear(x, n_in, n_out, bias=True, scope=None):
 
 class Vgg16Model():
     """ VGG16 model adapted from https://github.com/machrisaa/tensorflow-vgg"""
+
     def __init__(self, data_format="NCHW", use_bn=False, use_fused=False):
         self.image_mean = np.array([103.939, 116.779, 123.68])
         self.data_format = data_format
@@ -159,6 +160,7 @@ def run_VGG16(batch_size, n_trials, data_format="NHWC", use_XLA=False, use_bn=Fa
 
     t0 = time.time()
     for i in range(n_trials):
+        t = time.time()
        sess.run([apply_gradient_op])
 
     t1 = time.time()
diff --git a/main.py b/main.py
index 0dc0188..fa4d6b6 100644
--- a/main.py
+++ b/main.py
@@ -13,14 +13,14 @@
         description='Benchmarks largely inspired by https://github.com/aizvorski/vgg-benchmarks')
 
     parser.add_argument('--run_keras', action="store_true", help="Run keras benchmark")
     parser.add_argument('--run_tensorflow', action="store_true", help="Run pure tensorflow benchmark")
+    parser.add_argument('--run_pytorch', action="store_true", help="Run pytorch benchmark")
     parser.add_argument('--batch_size', default=16, type=int, help="Batch size")
     parser.add_argument('--n_trials', default=100, type=int, help="Number of full iterations (forward + backward + update)")
     parser.add_argument('--use_XLA', action="store_true", help="Whether to use XLA compiler")
-    parser.add_argument('--data_format', default="NHWC", type=str, help="Tensorflow image format")
+    parser.add_argument('--data_format', default="NCHW", type=str, help="Image format")
     parser.add_argument('--use_bn', action="store_true",help="Use batch normalization (tf benchmark)")
     parser.add_argument('--use_fused', action="store_true",help="Use fused batch normalization (tf benchmark)")
-    parser.add_argument('--bn_mode', default=2, type=int, help="keras batch norm mode")
 
     args = parser.parse_args()
 
@@ -33,7 +33,12 @@
         benchmark_keras.run_VGG16(args.batch_size,
                                   args.n_trials,
                                   args.use_bn,
-                                  args.bn_mode)
+                                  args.data_format)
+
+        # import benchmark_keras
+        # utils.print_module("Running %s..." % benchmark_keras.__name__)
+        # utils.print_dict(args.__dict__)
+        # benchmark_keras.run_SimpleCNN(args.batch_size)
 
     if args.run_tensorflow:
         import benchmark_tensorflow
@@ -45,3 +50,17 @@
                                        args.use_XLA,
                                        args.use_bn,
                                        args.use_fused)
+
+    if args.run_pytorch:
+        import benchmark_pytorch
+
+        utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+        utils.print_dict(args.__dict__)
+        benchmark_pytorch.run_VGG16(args.batch_size,
+                                    args.n_trials,)
+
+
+        # utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+        # utils.print_dict(args.__dict__)
+        # benchmark_pytorch.run_SimpleCNN(args.batch_size,
+        #                                 args.n_trials,)