diff --git a/README.md b/README.md
index 5524be9..c1e8c57 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,8 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
| Framework | Time [1](#foottime)|
|:---|:---|
-| Keras (Theano backend) BN mode 0 [3](#footBNmode)| 347.546 ms|
-| Keras (Theano backend) BN mode 2 | 269.074 ms|
+| Keras (Theano backend) + BN| 347.546 ms|
| Keras (TensorFlow backend) mode 0 | 560.938 ms|
-| Keras (TensorFlow backend) mode 2 | 504.966 ms|
|Tensorflow NHWC + BN no XLA| 493.235 ms|
|Tensorflow NHWC + BN + XLA| 341.702 ms|
|Tensorflow NHWC + fused BN no XLA| 395.963 ms|
@@ -38,8 +36,6 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
2: Note that at the moment, keras uses traditional NHWC tensorflow ordering
-3: Mode 0 = use per-batch statistics to normalize the data, and during testing use running averages computed during the training phase. Mode 2 = use per-batch statistics to normalize the data during training and testing.
-
### System specs
- Ubuntu 14.04
@@ -47,7 +43,7 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
- cuDNN 5.1.10
- theano '0.9.0beta1.dev-173eef98360c23d7418bad3a36f5fb938724f05f' (cuda backend)
- tensorflow 1.0.0 (compiled from source with CUDA 8.0 cuDNN 5.1.10 and XLA JIT)
-- Keras 1.2.2
+- Keras 2.0.1
## Usage
diff --git a/benchmark_keras.py b/benchmark_keras.py
index bbbd84e..26c6dc2 100644
--- a/benchmark_keras.py
+++ b/benchmark_keras.py
@@ -4,27 +4,30 @@
import keras.backend as K
from keras.models import Model
from keras.layers import Flatten, Dense, Input
-from keras.layers import Convolution2D, MaxPooling2D
+from keras.layers.convolutional import Conv2D
+from keras.layers.pooling import MaxPooling2D
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.normalization import BatchNormalization
+from keras.datasets import cifar10
import utils
+from keras.utils import np_utils
-def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, bn_mode):
+def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, data_format):
for i in range(nb_conv):
- x = Convolution2D(nb_filters, 3, 3, border_mode='same')(x)
+ x = Conv2D(filters=nb_filters, kernel_size=(3, 3), padding='same', data_format=data_format)(x)
if use_bn:
- x = BatchNormalization(mode=bn_mode, axis=bn_axis)(x)
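+ # scale=False skips the learnable gamma; per the Keras BatchNormalization docs this is
+ # safe when the layer feeds into a ReLU, since the following layer can absorb the scaling.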
+ x = BatchNormalization(scale=False, axis=bn_axis)(x)
x = Activation("relu")(x)
- x = MaxPooling2D((2, 2), strides=(2, 2))(x)
+ x = MaxPooling2D((2, 2), strides=(2, 2), padding='valid', data_format=data_format)(x)
return x
-def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
+def run_VGG16(batch_size=16, n_trials=100, use_bn=False, data_format="NCHW"):
"""Run VGG16 experiment
Args:
@@ -33,33 +36,39 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
"""
# Determine proper input shape
- if K.image_dim_ordering() == 'th':
+ if data_format == "NCHW":
+ assert K.image_data_format() == 'channels_first', "Change your keras.json file"
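+ # i.e. ~/.keras/keras.json must set "image_data_format": "channels_first"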
+ # Update NCHW to channels_first (keras conventions)
+ data_format = "channels_first"
input_shape = (3, 224, 224)
bn_axis = 1
else:
+ assert K.image_data_format() == 'channels_last', "Change your keras.json file"
+ data_format = "channels_last"
input_shape = (224, 224, 3)
bn_axis = -1
img_input = Input(shape=input_shape)
# Block 1
- x = vgg_block(img_input, 64, use_bn, 2, bn_axis, bn_mode)
+ x = vgg_block(img_input, 64, use_bn, 2, bn_axis, data_format)
# Block 2
- x = vgg_block(x, 128, use_bn, 2, bn_axis, bn_mode)
+ x = vgg_block(x, 128, use_bn, 2, bn_axis, data_format)
# Block 3
- x = vgg_block(x, 256, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 256, use_bn, 3, bn_axis, data_format)
# Block 4
- x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
# Block 5
- x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
# Classification block
x = Flatten(name='flatten')(x)
- x = Dense(4096, activation='relu', name='fc1')(x)
- x = Dense(4096, activation='relu', name='fc2')(x)
- x = Dense(1000, activation='softmax', name='predictions')(x)
+ x = Dense(units=4096, activation='relu', name='fc1')(x)
+ x = Dense(units=4096, activation='relu', name='fc2')(x)
+ x = Dense(units=1000, activation='softmax', name='predictions')(x)
# Create model
- model = Model(img_input, x)
+ model = Model(inputs=img_input, outputs=x)
+ model.summary()
opt = SGD()
model.compile(loss='categorical_crossentropy', optimizer=opt)
@@ -87,3 +96,67 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
utils.print_module("Keras backend: %s" % K.backend())
utils.print_module("Backend version: %s" % backend.__version__)
utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
+
+
+def run_SimpleCNN(batch_size):
+
+ # Determine proper input shape
+ if K.image_data_format() == 'channels_first':
+ input_shape = (3, 32, 32)
+ else:
+ input_shape = (32, 32, 3)
+
+ img_input = Input(shape=input_shape)
+ # Block 1
+ x = Conv2D(32, (3, 3), padding="same", activation="relu")(img_input)
+ x = Conv2D(32, (3, 3), padding="same", activation="relu")(x)
+ x = MaxPooling2D()(x)
+ # Block 2
+ x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+ x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+ x = MaxPooling2D()(x)
+ # Dense part
+ x = Flatten(name='flatten')(x)
+ x = Dense(512, activation="relu")(x)
+ x = Dense(10, activation="softmax")(x)
+
+ # Create model
+ model = Model(img_input, x)
+
+ opt = SGD()
+ model.compile(loss='categorical_crossentropy', optimizer=opt)
+
+ # Input data
+ data = cifar10.load_data()
+ X, Y = data[0][0], data[0][1]
+ Y = np_utils.to_categorical(Y, num_classes=10)
+
+ # warmup
+ model.train_on_batch(X[:32], Y[:32])
+
+ # Split in chunks of size batch size
+ num_elem = X.shape[0]
+ chunk_size = batch_size
+ num_chunks = num_elem // chunk_size
+ list_chunks = np.array_split(np.arange(num_elem), num_chunks)
+
+ for e in range(10):
+ t0 = time()
+ for index, chunk_idx in enumerate(list_chunks):
+ X_batch, Y_batch = X[chunk_idx], Y[chunk_idx]
+ model.train_on_batch(X_batch, Y_batch)
+ t1 = time()
+
+ print t1 - t0
+
+ # Import backend to get version number
+ if K.backend() == "tensorflow":
+ import tensorflow as backend
+ elif K.backend() == "theano":
+ import theano as backend
+
+ # Print summary
+ utils.print_module("Keras version: %s" % keras.__version__)
+ utils.print_module("Keras backend: %s" % K.backend())
+ utils.print_module("Backend version: %s" % backend.__version__)
+ utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / len(list_chunks)))
diff --git a/benchmark_pytorch.py b/benchmark_pytorch.py
new file mode 100644
index 0000000..df97de7
--- /dev/null
+++ b/benchmark_pytorch.py
@@ -0,0 +1,143 @@
+import time
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torchvision.datasets as dset
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+import torchvision.transforms as transforms
+from torchvision.models import vgg
+import utils
+
+
+class SimpleCNN(nn.Module):
+ def __init__(self):
+ super(SimpleCNN, self).__init__()
+
+ self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=True)
+ self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=True)
+ self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True)
+ self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=True)
+
+ self.fc1 = nn.Linear(4096, 512)
+ self.fc2 = nn.Linear(512, 10)
+
+ self._initialize_weights()
+
+ def _initialize_weights(self):
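+ # Conv2d: He-style init with fan_out = kh * kw * out_channels (std = sqrt(2 / fan_out));
+ # Linear: Gaussian weights with std 0.01 and zero biases.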
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ m.weight.data.normal_(0, math.sqrt(2. / n))
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ n = m.weight.size(1)
+ m.weight.data.normal_(0, 0.01)
+ m.bias.data.zero_()
+
+ def forward(self, x):
+
+ # First conv block
+ out = F.relu(self.conv1(x))
+ out = F.relu(self.conv2(out))
+ out = F.max_pool2d(out, 2)
+
+ # Second conv block
+ out = F.relu(self.conv3(out))
+ out = F.relu(self.conv4(out))
+ out = F.max_pool2d(out, 2)
+
+ # Flatten
+ out = out.view(out.size(0), -1)
+
+ # Linear
+ out = F.relu(self.fc1(out))
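+ # log_softmax pairs with the F.nll_loss call in run_SimpleCNN, which together
+ # amount to a cross-entropy loss.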
+ out = F.log_softmax(self.fc2(out))
+
+ return out
+
+
+def run_SimpleCNN(batch_size, nb_epoch):
+
+ normMean = [0.49139968, 0.48215827, 0.44653124]
+ normStd = [0.24703233, 0.24348505, 0.26158768]
+ normTransform = transforms.Normalize(normMean, normStd)
+
+ list_transforms = [transforms.ToTensor(), normTransform]
+ trainTransform = transforms.Compose(list_transforms)
+
+ kwargs = {'num_workers': 0, 'pin_memory': True}
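+ # pin_memory=True allocates page-locked host buffers, making the .cuda() copies cheaper;
+ # num_workers=0 keeps data loading in the main process.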
+ dataset = dset.CIFAR10(root='cifar', train=True, download=True, transform=trainTransform)
+ trainLoader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)
+
+ net = SimpleCNN()
+
+ net = net.cuda()
+
+ optimizer = optim.SGD(net.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-4)
+
+ for epoch in range(nb_epoch):
+
+ s = time.time()
+
+ for batch_idx, (data, target) in enumerate(trainLoader):
+ data, target = data.cuda(), target.cuda()
+ data, target = Variable(data), Variable(target)
+ optimizer.zero_grad()
+ output = net(data)
+ loss = F.nll_loss(output, target)
+ loss.backward()
+ optimizer.step()
+
+ print time.time() - s
+
+
+def run_VGG16(batch_size, n_trials):
+
+ # Initialize network
+ net = vgg.vgg16()
+ net.cuda()
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss().cuda()
+ optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+ # Data
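+ # Synthetic data: random images and labels so the benchmark measures compute only,
+ # not disk I/O or preprocessing.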
+ n_classes = 1000
+ labels = np.random.randint(0, n_classes, batch_size * n_trials).tolist()
+ labels = torch.LongTensor(labels)
+ inputs = torch.randn(batch_size * n_trials, 3, 224, 224)
+
+ dataset = torch.utils.data.TensorDataset(inputs, labels)
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)
+
+ t0 = time.time()
+ n = 0
+ for i, (X, y) in enumerate(dataloader):
+
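+ # With pin_memory=True above, async=True lets these host-to-device copies
+ # overlap with computation.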
+ ll = Variable(y.cuda(async=True))
+ inp = Variable(X.cuda(async=True))
+
+ # forward pass
+ outputs = net(inp)
+
+ # compute loss
+ loss = criterion(outputs, ll)
+
+ # zero the parameter gradients
+ optimizer.zero_grad()
+
+ loss.backward()
+ optimizer.step()
+
+ n += 1
+
+ t1 = time.time()
+
+ # Print summary
+ utils.print_module("pytorch version: %s" % torch.__version__)
+ utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
\ No newline at end of file
diff --git a/benchmark_tensorflow.py b/benchmark_tensorflow.py
index cf081d4..bc4efa5 100644
--- a/benchmark_tensorflow.py
+++ b/benchmark_tensorflow.py
@@ -37,6 +37,7 @@ def linear(x, n_in, n_out, bias=True, scope=None):
class Vgg16Model():
""" VGG16 model adapted from https://github.com/machrisaa/tensorflow-vgg"""
+
def __init__(self, data_format="NCHW", use_bn=False, use_fused=False):
self.image_mean = np.array([103.939, 116.779, 123.68])
self.data_format = data_format
@@ -159,6 +160,7 @@ def run_VGG16(batch_size, n_trials, data_format="NHWC", use_XLA=False, use_bn=Fa
t0 = time.time()
for i in range(n_trials):
sess.run([apply_gradient_op])
t1 = time.time()
diff --git a/main.py b/main.py
index 0dc0188..fa4d6b6 100644
--- a/main.py
+++ b/main.py
@@ -13,14 +13,14 @@
description='Benchmarks largely inspired by https://github.com/aizvorski/vgg-benchmarks')
parser.add_argument('--run_keras', action="store_true", help="Run keras benchmark")
parser.add_argument('--run_tensorflow', action="store_true", help="Run pure tensorflow benchmark")
+ parser.add_argument('--run_pytorch', action="store_true", help="Run pytorch benchmark")
parser.add_argument('--batch_size', default=16, type=int, help="Batch size")
parser.add_argument('--n_trials', default=100, type=int,
help="Number of full iterations (forward + backward + update)")
parser.add_argument('--use_XLA', action="store_true", help="Whether to use XLA compiler")
- parser.add_argument('--data_format', default="NHWC", type=str, help="Tensorflow image format")
+ parser.add_argument('--data_format', default="NCHW", type=str, help="Image format")
parser.add_argument('--use_bn', action="store_true",help="Use batch normalization (tf benchmark)")
parser.add_argument('--use_fused', action="store_true",help="Use fused batch normalization (tf benchmark)")
- parser.add_argument('--bn_mode', default=2, type=int, help="keras batch norm mode")
args = parser.parse_args()
@@ -33,7 +33,12 @@
benchmark_keras.run_VGG16(args.batch_size,
args.n_trials,
args.use_bn,
- args.bn_mode)
+ args.data_format)
+
+ # import benchmark_keras
+ # utils.print_module("Running %s..." % benchmark_keras.__name__)
+ # utils.print_dict(args.__dict__)
+ # benchmark_keras.run_SimpleCNN(args.batch_size)
if args.run_tensorflow:
import benchmark_tensorflow
@@ -45,3 +50,17 @@
args.use_XLA,
args.use_bn,
args.use_fused)
+
+ if args.run_pytorch:
+ import benchmark_pytorch
+
+ utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+ utils.print_dict(args.__dict__)
+ benchmark_pytorch.run_VGG16(args.batch_size,
+ args.n_trials,)
+
+
+ # utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+ # utils.print_dict(args.__dict__)
+ # benchmark_pytorch.run_SimpleCNN(args.batch_size,
+ # args.n_trials,)