diff --git a/README.md b/README.md
index 5524be9..c1e8c57 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,8 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
| Framework | Time [1](#foottime)|
|:---|:---|
-| Keras (Theano backend) BN mode 0 [3](#footBNmode)| 347.546 ms|
-| Keras (Theano backend) BN mode 2 | 269.074 ms|
+| Keras (Theano backend) + BN| 347.546 ms|
| Keras (TensorFlow backend) mode 0 | 560.938 ms|
-| Keras (TensorFlow backend) mode 2 | 504.966 ms|
|Tensorflow NHWC + BN no XLA| 493.235 ms|
|Tensorflow NHWC + BN + XLA| 341.702 ms|
|Tensorflow NHWC + fused BN no XLA| 395.963 ms|
@@ -38,8 +36,6 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
2: Note that at the moment, keras uses traditional NHWC tensorflow ordering
-3: Mode 0 = use per-batch statistics to normalize the data, and during testing use running averages computed during the training phase. Mode 2 = use per-batch statistics to normalize the data during training and testing.
-
### System specs
- Ubuntu 14.04
@@ -47,7 +43,7 @@ We tried to get the most out of each framework (GPU util is at 99% for all scrip
- cuDNN 5.1.10
- theano '0.9.0beta1.dev-173eef98360c23d7418bad3a36f5fb938724f05f' (cuda backend)
- tensorflow 1.0.0 (compiled from source with CUDA 8.0 cuDNN 5.1.10 and XLA JIT)
-- Keras 1.2.2
+- Keras 2.0.1
## Usage
diff --git a/benchmark_keras.py b/benchmark_keras.py
index bbbd84e..26c6dc2 100644
--- a/benchmark_keras.py
+++ b/benchmark_keras.py
@@ -4,27 +4,30 @@
import keras.backend as K
from keras.models import Model
from keras.layers import Flatten, Dense, Input
-from keras.layers import Convolution2D, MaxPooling2D
+from keras.layers.convolutional import Conv2D
+from keras.layers.pooling import MaxPooling2D
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.normalization import BatchNormalization
+from keras.datasets import cifar10
import utils
+from keras.utils import np_utils
-def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, bn_mode):
+def vgg_block(x, nb_filters, use_bn, nb_conv, bn_axis, data_format):
for i in range(nb_conv):
- x = Convolution2D(nb_filters, 3, 3, border_mode='same')(x)
+ x = Conv2D(filters=nb_filters, kernel_size=(3, 3), padding='same', data_format=data_format)(x)
if use_bn:
- x = BatchNormalization(mode=bn_mode, axis=bn_axis)(x)
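+ # scale=False skips the learnable gamma; per the Keras BatchNormalization docs this is
+ # safe when the layer feeds into a ReLU, since the following layer can absorb the scaling.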
+ x = BatchNormalization(scale=False, axis=bn_axis)(x)
x = Activation("relu")(x)
- x = MaxPooling2D((2, 2), strides=(2, 2))(x)
+ x = MaxPooling2D((2, 2), strides=(2, 2), padding='valid', data_format=data_format)(x)
return x
-def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
+def run_VGG16(batch_size=16, n_trials=100, use_bn=False, data_format="NCHW"):
"""Run VGG16 experiment
Args:
@@ -33,33 +36,39 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
"""
# Determine proper input shape
- if K.image_dim_ordering() == 'th':
+ if data_format == "NCHW":
+ assert K.image_data_format() == 'channels_first', "Change your keras.json file"
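+ # i.e. ~/.keras/keras.json must set "image_data_format": "channels_first"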
+ # Update NCHW to channels_first (keras conventions)
+ data_format = "channels_first"
input_shape = (3, 224, 224)
bn_axis = 1
else:
+ assert K.image_data_format() == 'channels_last', "Change your keras.json file"
+ data_format = "channels_last"
input_shape = (224, 224, 3)
bn_axis = -1
img_input = Input(shape=input_shape)
# Block 1
- x = vgg_block(img_input, 64, use_bn, 2, bn_axis, bn_mode)
+ x = vgg_block(img_input, 64, use_bn, 2, bn_axis, data_format)
# Block 2
- x = vgg_block(x, 128, use_bn, 2, bn_axis, bn_mode)
+ x = vgg_block(x, 128, use_bn, 2, bn_axis, data_format)
# Block 3
- x = vgg_block(x, 256, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 256, use_bn, 3, bn_axis, data_format)
# Block 4
- x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
# Block 5
- x = vgg_block(x, 512, use_bn, 3, bn_axis, bn_mode)
+ x = vgg_block(x, 512, use_bn, 3, bn_axis, data_format)
# Classification block
x = Flatten(name='flatten')(x)
- x = Dense(4096, activation='relu', name='fc1')(x)
- x = Dense(4096, activation='relu', name='fc2')(x)
- x = Dense(1000, activation='softmax', name='predictions')(x)
+ x = Dense(units=4096, activation='relu', name='fc1')(x)
+ x = Dense(units=4096, activation='relu', name='fc2')(x)
+ x = Dense(units=1000, activation='softmax', name='predictions')(x)
# Create model
- model = Model(img_input, x)
+ model = Model(inputs=img_input, outputs=x)
+ model.summary()
opt = SGD()
model.compile(loss='categorical_crossentropy', optimizer=opt)
@@ -87,3 +96,67 @@ def run_VGG16(batch_size=16, n_trials=100, use_bn=False, bn_mode=2):
utils.print_module("Keras backend: %s" % K.backend())
utils.print_module("Backend version: %s" % backend.__version__)
utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
+
+
+def run_SimpleCNN(batch_size):
+
+ # Determine proper input shape
+ if K.image_data_format() == 'channels_first':
+ input_shape = (3, 32, 32)
+ else:
+ input_shape = (32, 32, 3)
+
+ img_input = Input(shape=input_shape)
+ # Block 1
+ x = Conv2D(32, (3, 3), padding="same", activation="relu")(img_input)
+ x = Conv2D(32, (3, 3), padding="same", activation="relu")(x)
+ x = MaxPooling2D()(x)
+ # Block 2
+ x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+ x = Conv2D(64, (3, 3), padding="same", activation="relu")(x)
+ x = MaxPooling2D()(x)
+ # Dense part
+ x = Flatten(name='flatten')(x)
+ x = Dense(512, activation="relu")(x)
+ x = Dense(10, activation="softmax")(x)
+
+ # Create model
+ model = Model(img_input, x)
+
+ opt = SGD()
+ model.compile(loss='categorical_crossentropy', optimizer=opt)
+
+ # Input data
+ data = cifar10.load_data()
+ X, Y = data[0][0], data[0][1]
+ Y = np_utils.to_categorical(Y, num_classes=10)
+
+ # warmup
+ model.train_on_batch(X[:32], Y[:32])
+
+ # Split in chunks of size batch size
+ num_elem = X.shape[0]
+ chunk_size = batch_size
+ num_chunks = num_elem // chunk_size
+ list_chunks = np.array_split(np.arange(num_elem), num_chunks)
+
+ for e in range(10):
+ t0 = time()
+ for index, chunk_idx in enumerate(list_chunks):
+ X_batch, Y_batch = X[chunk_idx], Y[chunk_idx]
+ model.train_on_batch(X_batch, Y_batch)
+ t1 = time()
+
+ print t1 - t0
+
+ # Import backend to get version number
+ if K.backend() == "tensorflow":
+ import tensorflow as backend
+ elif K.backend() == "theano":
+ import theano as backend
+
+ # Print summary
+ utils.print_module("Keras version: %s" % keras.__version__)
+ utils.print_module("Keras backend: %s" % K.backend())
+ utils.print_module("Backend version: %s" % backend.__version__)
+ utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / len(list_chunks)))
diff --git a/benchmark_pytorch.py b/benchmark_pytorch.py
new file mode 100644
index 0000000..df97de7
--- /dev/null
+++ b/benchmark_pytorch.py
@@ -0,0 +1,143 @@
+import time
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torchvision.datasets as dset
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+import torchvision.transforms as transforms
+from torchvision.models import vgg
+import utils
+
+
+class SimpleCNN(nn.Module):
+ def __init__(self):
+ super(SimpleCNN, self).__init__()
+
+ self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=True)
+ self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=True)
+ self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True)
+ self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=True)
+
+ self.fc1 = nn.Linear(4096, 512)
+ self.fc2 = nn.Linear(512, 10)
+
+ self._initialize_weights()
+
+ def _initialize_weights(self):
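+ # Conv2d: He-style init with fan_out = kh * kw * out_channels (std = sqrt(2 / fan_out));
+ # Linear: Gaussian weights with std 0.01 and zero biases.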
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ m.weight.data.normal_(0, math.sqrt(2. / n))
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ n = m.weight.size(1)
+ m.weight.data.normal_(0, 0.01)
+ m.bias.data.zero_()
+
+ def forward(self, x):
+
+ # First conv block
+ out = F.relu(self.conv1(x))
+ out = F.relu(self.conv2(out))
+ out = F.max_pool2d(out, 2)
+
+ # Second conv block
+ out = F.relu(self.conv3(out))
+ out = F.relu(self.conv4(out))
+ out = F.max_pool2d(out, 2)
+
+ # Flatten
+ out = out.view(out.size(0), -1)
+
+ # Linear
+ out = F.relu(self.fc1(out))
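+ # log_softmax pairs with the F.nll_loss call in run_SimpleCNN, which together
+ # amount to a cross-entropy loss.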
+ out = F.log_softmax(self.fc2(out))
+
+ return out
+
+
+def run_SimpleCNN(batch_size, nb_epoch):
+
+ normMean = [0.49139968, 0.48215827, 0.44653124]
+ normStd = [0.24703233, 0.24348505, 0.26158768]
+ normTransform = transforms.Normalize(normMean, normStd)
+
+ list_transforms = [transforms.ToTensor(), normTransform]
+ trainTransform = transforms.Compose(list_transforms)
+
+ kwargs = {'num_workers': 0, 'pin_memory': True}
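+ # pin_memory=True allocates page-locked host buffers, making the .cuda() copies cheaper;
+ # num_workers=0 keeps data loading in the main process.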
+ dataset = dset.CIFAR10(root='cifar', train=True, download=True, transform=trainTransform)
+ trainLoader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)
+
+ net = SimpleCNN()
+
+ net = net.cuda()
+
+ optimizer = optim.SGD(net.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-4)
+
+ for epoch in range(nb_epoch):
+
+ s = time.time()
+
+ for batch_idx, (data, target) in enumerate(trainLoader):
+ data, target = data.cuda(), target.cuda()
+ data, target = Variable(data), Variable(target)
+ optimizer.zero_grad()
+ output = net(data)
+ loss = F.nll_loss(output, target)
+ loss.backward()
+ optimizer.step()
+
+ print time.time() - s
+
+
+def run_VGG16(batch_size, n_trials):
+
+ # Initialize network
+ net = vgg.vgg16()
+ net.cuda()
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss().cuda()
+ optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+ # Data
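+ # Synthetic data: random images and labels so the benchmark measures compute only,
+ # not disk I/O or preprocessing.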
+ n_classes = 1000
+ labels = np.random.randint(0, n_classes, batch_size * n_trials).tolist()
+ labels = torch.LongTensor(labels)
+ inputs = torch.randn(batch_size * n_trials, 3, 224, 224)
+
+ dataset = torch.utils.data.TensorDataset(inputs, labels)
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)
+
+ t0 = time.time()
+ n = 0
+ for i, (X, y) in enumerate(dataloader):
+
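+ # With pin_memory=True above, async=True lets these host-to-device copies
+ # overlap with computation.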
+ ll = Variable(y.cuda(async=True))
+ inp = Variable(X.cuda(async=True))
+
+ # forward pass
+ outputs = net(inp)
+
+ # compute loss
+ loss = criterion(outputs, ll)
+
+ # zero the parameter gradients
+ optimizer.zero_grad()
+
+ loss.backward()
+ optimizer.step()
+
+ n += 1
+
+ t1 = time.time()
+
+ # Print summary
+ utils.print_module("pytorch version: %s" % torch.__version__)
+ utils.print_result("%7.3f ms." % (1000. * (t1 - t0) / n_trials))
\ No newline at end of file
diff --git a/benchmark_tensorflow.py b/benchmark_tensorflow.py
index cf081d4..bc4efa5 100644
--- a/benchmark_tensorflow.py
+++ b/benchmark_tensorflow.py
@@ -37,6 +37,7 @@ def linear(x, n_in, n_out, bias=True, scope=None):
class Vgg16Model():
""" VGG16 model adapted from https://github.com/machrisaa/tensorflow-vgg"""
+
def __init__(self, data_format="NCHW", use_bn=False, use_fused=False):
self.image_mean = np.array([103.939, 116.779, 123.68])
self.data_format = data_format
@@ -159,6 +160,7 @@ def run_VGG16(batch_size, n_trials, data_format="NHWC", use_XLA=False, use_bn=Fa
t0 = time.time()
for i in range(n_trials):
sess.run([apply_gradient_op])
t1 = time.time()
diff --git a/main.py b/main.py
index 0dc0188..fa4d6b6 100644
--- a/main.py
+++ b/main.py
@@ -13,14 +13,14 @@
description='Benchmarks largely inspired by https://github.com/aizvorski/vgg-benchmarks')
parser.add_argument('--run_keras', action="store_true", help="Run keras benchmark")
parser.add_argument('--run_tensorflow', action="store_true", help="Run pure tensorflow benchmark")
+ parser.add_argument('--run_pytorch', action="store_true", help="Run pytorch benchmark")
parser.add_argument('--batch_size', default=16, type=int, help="Batch size")
parser.add_argument('--n_trials', default=100, type=int,
help="Number of full iterations (forward + backward + update)")
parser.add_argument('--use_XLA', action="store_true", help="Whether to use XLA compiler")
- parser.add_argument('--data_format', default="NHWC", type=str, help="Tensorflow image format")
+ parser.add_argument('--data_format', default="NCHW", type=str, help="Image format")
parser.add_argument('--use_bn', action="store_true",help="Use batch normalization (tf benchmark)")
parser.add_argument('--use_fused', action="store_true",help="Use fused batch normalization (tf benchmark)")
- parser.add_argument('--bn_mode', default=2, type=int, help="keras batch norm mode")
args = parser.parse_args()
@@ -33,7 +33,12 @@
benchmark_keras.run_VGG16(args.batch_size,
args.n_trials,
args.use_bn,
- args.bn_mode)
+ args.data_format)
+
+ # import benchmark_keras
+ # utils.print_module("Running %s..." % benchmark_keras.__name__)
+ # utils.print_dict(args.__dict__)
+ # benchmark_keras.run_SimpleCNN(args.batch_size)
if args.run_tensorflow:
import benchmark_tensorflow
@@ -45,3 +50,17 @@
args.use_XLA,
args.use_bn,
args.use_fused)
+
+ if args.run_pytorch:
+ import benchmark_pytorch
+
+ utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+ utils.print_dict(args.__dict__)
+ benchmark_pytorch.run_VGG16(args.batch_size,
+ args.n_trials,)
+
+
+ # utils.print_module("Running %s..." % benchmark_pytorch.__name__)
+ # utils.print_dict(args.__dict__)
+ # benchmark_pytorch.run_SimpleCNN(args.batch_size,
+ # args.n_trials,)