Update benchmarks
Thibault de Boissiere committed Mar 21, 2017
1 parent dd5e3ae commit 9223f8a
Showing 5 changed files with 258 additions and 25 deletions.
8 changes: 2 additions & 6 deletions README.md
@@ -21,10 +21,8 @@ We tried to get the most out of each framework (GPU util is at 99% for all scripts)

| Framework | Time <sup>[1](#foottime)</sup>|
|:---|:---|
| Keras (Theano backend) BN mode 0 <sup>[3](#footBNmode)</sup>| 347.546 ms|
| Keras (Theano backend) BN mode 2 | 269.074 ms|
| Keras (Theano backend) + BN| 347.546 ms|
| Keras (TensorFlow backend) mode 0 | 560.938 ms|
| Keras (TensorFlow backend) mode 2 | 504.966 ms|
|Tensorflow NHWC + BN no XLA| 493.235 ms|
|Tensorflow NHWC + BN + XLA| 341.702 ms|
|Tensorflow NHWC + fused BN no XLA| 395.963 ms|
@@ -38,16 +36,14 @@ We tried to get the most out of each framework (GPU util is at 99% for all scripts)

<a name="kerasnote">2</a>: Note that at the moment, keras uses traditional NHWC tensorflow ordering

<a name="footBNmode">3</a>: Mode 0 = use per-batch statistics to normalize the data, and during testing use running averages computed during the training phase. Mode 2 = use per-batch statistics to normalize the data during training and testing.

### System specs

- Ubuntu 14.04
- Cuda 8.0
- cuDNN 5.1.10
- theano '0.9.0beta1.dev-173eef98360c23d7418bad3a36f5fb938724f05f' (cuda backend)
- tensorflow 1.0.0 (compiled from source with CUDA 8.0 cuDNN 5.1.10 and XLA JIT)
- Keras 1.2.2
- Keras 2.0.1

## Usage

105 changes: 89 additions & 16 deletions benchmark_keras.py
@@ -4,27 +4,30 @@
import keras.backend as K
from keras.models import Model
from keras.layers import Flatten, Dense, Input
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.normalization import BatchNormalization
from keras.datasets import cifar10
import utils
from keras.utils import np_utils


def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, bn_mode):
def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, data_format):

for i in range(nb_conv):
x = Convolution2D(nb_filters, 3, 3, border_mode='same')(x)
x = Conv2D(filters=nb_filters, kernel_size=(3, 3), padding='same', data_format=data_format)(x)
if use_bn:
x = BatchNormalization(mode=bn_mode, axis=bn_axis)(x)
x = BatchNormalization(scale=False, axis=bn_axis)(x)
x = Activation("relu")(x)

x = MaxPooling2D((2, 2), strides=(2, 2))(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='valid', data_format=data_format)(x)

return x


def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
def run_VGG16(batch_size=16, n_trials=100, use_bn=False, data_format="NCHW"):
"""Run VGG16 experiment
Args:
@@ -33,33 +36,39 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
"""

# Determine proper input shape
if K.image_dim_ordering() == 'th':
if data_format == "NCHW":
assert K.image_data_format() == 'channels_first', "Change your keras.json file"
# Update NCHW to channels_first (keras conventions)
data_format = "channels_first"
input_shape = (3, 224, 224)
bn_axis = 1
else:
assert K.image_data_format() == 'channels_last', "Change your keras.json file"
data_format = "channels_last"
input_shape = (224, 224, 3)
bn_axis = -1
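# BatchNormalization normalizes over the channel axis,
# hence axis=1 for channels_first and axis=-1 for channels_last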

img_input = Input(shape=input_shape)
# Block 1
x = vgg_block(img_input, 64, use_bn, 2, bn_axis, bn_mode)
x = vgg_block(img_input, 64, use_bn, 2, bn_axis, data_format)
# Block 2
x = vgg_block(x, 128, use_bn, 2, bn_axis, bn_mode)
x = vgg_block(x, 128, use_bn, 2, bn_axis, data_format)
# Block 3
x = vgg_block(x, 256, use_bn, 3, bn_axis, bn_mode)
x = vgg_block(x, 256, use_bn, 3, bn_axis, data_format)
# Block 4
x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
# Block 5
x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)

# Classification block
x = Flatten(name='flatten')(x)
x = Dense(4096, activation='relu', name='fc1')(x)
x = Dense(4096, activation='relu', name='fc2')(x)
x = Dense(1000, activation='softmax', name='predictions')(x)
x = Dense(units=4096, activation='relu', name='fc1')(x)
x = Dense(units=4096, activation='relu', name='fc2')(x)
x = Dense(units=1000, activation='softmax', name='predictions')(x)

# Create model
model = Model(img_input, x)
model = Model(inputs=img_input, outputs=x)
model.summary()

opt = SGD()
model.compile(loss='categorical_crossentropy', optimizer=opt)
@@ -87,3 +96,67 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
utils.print_module("Keras backend: %s" % K.backend())
utils.print_module("Backend version: %s" % backend.__version__)
utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))


def run_SimpleCNN(batch_size):

# Determine proper input shape
if K.image_data_format() == 'channels_first':
input_shape = (3, 32, 32)
else:
input_shape = (32, 32, 3)

img_input = Input(shape=input_shape)
# Block 1
x = Conv2D(32, (3, 3), padding="same", activation="relu")(img_input)
x = Conv2D(32, (3, 3), padding="same", activation="relu")(x)
x = MaxPooling2D()(x)
# Block 2
x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
x = MaxPooling2D()(x)
# Dense part
x = Flatten(name='flatten')(x)
x = Dense(512, activation="relu")(x)
x = Dense(10, activation="softmax")(x)  # softmax (not relu) to pair with categorical cross-entropy

# Create model
model = Model(img_input, x)

opt = SGD()
model.compile(loss='categorical_crossentropy', optimizer=opt)

# Input data
data = cifar10.load_data()
X, Y = data[0][0], data[0][1]
Y = np_utils.to_categorical(Y, num_classes=10)

# warmup
model.train_on_batch(X[:32], Y[:32])

# Split in chunks of size batch size
num_elem = X.shape[0]
chunk_size = batch_size
num_chunks = num_elem // chunk_size  # integer division so np.array_split gets an int
list_chunks = np.array_split(np.arange(num_elem), num_chunks)
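# np.array_split keeps every sample even when num_elem is not an exact
# multiple of batch_size, so the final chunk may be smaller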

for e in range(10):
t0 = time()
for index, chunk_idx in enumerate(list_chunks):
X_batch, Y_batch = X[chunk_idx], Y[chunk_idx]
model.train_on_batch(X_batch, Y_batch)
t1 = time()

print t1 - t0

# Import backend to get version number
if K.backend() == "tensorflow":
import tensorflow as backend
elif K.backend() == "theano":
import theano as backend

# Print summary
utils.print_module("Keras version: %s" % keras.__version__)
utils.print_module("Keras backend: %s" % K.backend())
utils.print_module("Backend version: %s" % backend.__version__)
utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / len(list_chunks)))  # average time per batch over the last epoch
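
For context, a minimal usage sketch (mirroring the call in `main.py` below; the keyword values are simply the parser defaults, with batch normalization switched on):

```python
import benchmark_keras

# Time 100 forward + backward + update iterations of VGG16 at batch size 16
benchmark_keras.run_VGG16(batch_size=16, n_trials=100, use_bn=True, data_format="NCHW")
```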
143 changes: 143 additions & 0 deletions benchmark_pytorch.py
@@ -0,0 +1,143 @@
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as dset
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.models import vgg
import utils


class SimpleCNN(nn.Module):
def __init__(self):
super(SimpleCNN, self).__init__()

self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=True)
self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=True)
self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True)
self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=True)

self.fc1 = nn.Linear(4096, 512)
self.fc2 = nn.Linear(512, 10)

self._initialize_weights()

def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()
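# This is He initialization: conv weights are drawn from N(0, sqrt(2 / n))
# with n = fan-out (kernel_h * kernel_w * out_channels), which keeps
# activation variance roughly constant through the ReLU layers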

def forward(self, x):

# First conv block
out = F.relu(self.conv1(x))
out = F.relu(self.conv2(out))
out = F.max_pool2d(out, 2)

# Second conv block
out = F.relu(self.conv3(out))
out = F.relu(self.conv4(out))
out = F.max_pool2d(out, 2)

# Flatten
out = out.view(out.size(0), -1)

# Linear
out = F.relu(self.fc1(out))
out = F.log_softmax(self.fc2(out))

return out
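# Shape check: a 32x32 CIFAR-10 image passes through two 2x2 max-pools,
# leaving 64 feature maps of size 8x8, so the flattened vector has
# 64 * 8 * 8 = 4096 entries -- hence nn.Linear(4096, 512) above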


def run_SimpleCNN(batch_size, nb_epoch):

normMean = [0.49139968, 0.48215827, 0.44653124]
normStd = [0.24703233, 0.24348505, 0.26158768]
normTransform = transforms.Normalize(normMean, normStd)

list_transforms = [transforms.ToTensor(), normTransform]
trainTransform = transforms.Compose(list_transforms)

kwargs = {'num_workers': 0, 'pin_memory': True}
dataset = dset.CIFAR10(root='cifar', train=True, download=True, transform=trainTransform)
trainLoader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)

net = SimpleCNN()

net = net.cuda()

optimizer = optim.SGD(net.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-4)

for epoch in range(nb_epoch):

s = time.time()

for batch_idx, (data, target) in enumerate(trainLoader):
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = net(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()

print time.time() - s


def run_VGG16(batch_size, n_trials):

# Initialize network
net = vgg.vgg16()
net.cuda()

# Loss and optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Data
n_classes = 1000
labels = np.random.randint(0, n_classes, batch_size * n_trials).astype(np.int64).tolist()  # int64: a uint8 cast would wrap labels above 255
labels = torch.LongTensor(labels)
inputs = torch.randn(batch_size * n_trials, 3, 224, 224)

dataset = torch.utils.data.TensorDataset(inputs, labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)
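# pin_memory=True allocates page-locked host buffers, which lets the
# .cuda(async=True) copies below overlap with GPU compute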

t0 = time.time()
n = 0
for i, (X, y) in enumerate(dataloader):

ll = Variable(y.cuda(async=True))
inp = Variable(X.cuda(async=True))

# forward pass
outputs = net(inp)

# compute loss
loss = criterion(outputs, ll)

# zero the parameter gradients
optimizer.zero_grad()

loss.backward()
optimizer.step()

n += 1

t1 = time.time()

# Print summary
utils.print_module("pytorch version: %s" % torch.__version__)
utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
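
The PyTorch benchmark can be driven the same way (again mirroring `main.py`):

```python
import benchmark_pytorch

# 100 synthetic-batch iterations of VGG16 (batch size 16) on the GPU
benchmark_pytorch.run_VGG16(batch_size=16, n_trials=100)
```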
2 changes: 2 additions & 0 deletions benchmark_tensorflow.py
@@ -37,6 +37,7 @@ def linear(x, n_in, n_out, bias=True, scope=None):

class Vgg16Model():
""" VGG16 model adapted from https://github.com/machrisaa/tensorflow-vgg"""

def __init__(self, data_format="NCHW", use_bn=False, use_fused=False):
self.image_mean = np.array([103.939, 116.779, 123.68])
self.data_format = data_format
@@ -159,6 +160,7 @@ def run_VGG16(batch_size, n_trials, data_format="NHWC", use_XLA=False, use_bn=False, use_fused=False):

t0 = time.time()
for i in range(n_trials):
t = time.time()
sess.run([apply_gradient_op])
t1 = time.time()

25 changes: 22 additions & 3 deletions main.py
@@ -13,14 +13,14 @@
description='Benchmarks largely inspired by https://github.com/aizvorski/vgg-benchmarks')
parser.add_argument('--run_keras', action="store_true", help="Run keras benchmark")
parser.add_argument('--run_tensorflow', action="store_true", help="Run pure tensorflow benchmark")
parser.add_argument('--run_pytorch', action="store_true", help="Run pytorch benchmark")
parser.add_argument('--batch_size', default=16, type=int, help="Batch size")
parser.add_argument('--n_trials', default=100, type=int,
help="Number of full iterations (forward + backward + update)")
parser.add_argument('--use_XLA', action="store_true", help="Whether to use XLA compiler")
parser.add_argument('--data_format', default="NHWC", type=str, help="Tensorflow image format")
parser.add_argument('--data_format', default="NCHW", type=str, help="Image format")
parser.add_argument('--use_bn', action="store_true",help="Use batch normalization (tf benchmark)")
parser.add_argument('--use_fused', action="store_true",help="Use fused batch normalization (tf benchmark)")
parser.add_argument('--bn_mode', default=2, type=int, help="keras batch norm mode")

args = parser.parse_args()

@@ -33,7 +33,12 @@
benchmark_keras.run_VGG16(args.batch_size,
args.n_trials,
args.use_bn,
args.bn_mode)
args.data_format)

# import benchmark_keras
# utils.print_module("Running %s..." % benchmark_keras.__name__)
# utils.print_dict(args.__dict__)
# benchmark_keras.run_SimpleCNN(args.batch_size)

if args.run_tensorflow:
import benchmark_tensorflow
@@ -45,3 +50,17 @@
args.use_XLA,
args.use_bn,
args.use_fused)

if args.run_pytorch:
import benchmark_pytorch

utils.print_module("Running %s..." % benchmark_pytorch.__name__)
utils.print_dict(args.__dict__)
benchmark_pytorch.run_VGG16(args.batch_size,
args.n_trials,)


# utils.print_module("Running %s..." % benchmark_pytorch.__name__)
# utils.print_dict(args.__dict__)
# benchmark_pytorch.run_SimpleCNN(args.batch_size,
# args.n_trials,)
