Chainer's BatchNormalization does not work well in my code, although batch normalization in TensorFlow works fine. I use the MNIST dataset, as the code below shows.
With Chainer (version 6.1.0) and no batch normalization, the validation accuracy is between 0.97 and 0.98 after 100 epochs, whereas with batch normalization it stays below 0.80 after 100 epochs.
When I do the same with TensorFlow (version 1.14.0), the validation accuracy is around 0.98 in both cases, with or without batch normalization.
This is the relevant part of my code. The number of epochs is 100 and the batch size is 1000. I use Adam as the optimizer with learning rate 0.01.
dataset, train data, validation data
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
model and condition (chainer)
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.99, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)
n_epoch = 100
n_batchsize = 1000
model and condition (tensorflow)
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
All code I use with chainer
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from keras.datasets import mnist
import cupy as cp
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)
# define optimizer
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)
## learn network
n_epoch = 100
n_batchsize = 1000
iteration = 0
gpu_id = 0
cuda.get_device(gpu_id).use()
# send the network to gpu memory
model.to_gpu(gpu_id)
print("epoch train/loss val/loss train/acc val/acc")
for epoch in range(n_epoch):
    # order dataset randomly
    order = np.random.permutation(range(len(x_train)))
    loss_list = []
    accuracy_list = []
    for i in range(0, len(order), n_batchsize):
        index = order[i:i+n_batchsize]
        x_train_batch = x_train[index, :]
        y_train_batch = y_train[index]
        x_train_batch = cp.asarray(x_train_batch)
        y_train_batch = cp.asarray(y_train_batch)
        output_train = model(x_train_batch)
        loss_train_batch = F.softmax_cross_entropy(output_train, y_train_batch)
        accuracy_train_batch = F.accuracy(output_train, y_train_batch)
        loss_list.append(cuda.to_cpu(loss_train_batch.array))
        accuracy_list.append(cuda.to_cpu(accuracy_train_batch.array))
        model.cleargrads()
        loss_train_batch.backward()
        optimizer.update()
        iteration += 1
    loss_train = np.mean(loss_list)
    accuracy_train = np.mean(accuracy_list)
    # after one epoch, evaluate with validation data
    x_val = cp.asarray(x_val)
    y_val = cp.asarray(y_val)
    with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
        output_val = model(x_val)
        loss_val = F.softmax_cross_entropy(output_val, y_val)
    loss_val = cuda.to_cpu(loss_val.array)
    accuracy_val = F.accuracy(output_val, y_val)
    accuracy_val = cuda.to_cpu(accuracy_val.array)
    print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'.format(epoch, loss_train, loss_val, accuracy_train, accuracy_val))
All code I use with tensorflow
import tensorflow as tf
from keras.datasets import mnist
from functools import partial
import numpy as np
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
X_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
X_valid = test_images.astype('float32')/255
y_valid = test_labels.astype('int32')
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 100
batch_size = 1000
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print("epoch train/loss val/loss train/acc val/acc")
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        loss_list = []
        accuracy_list = []
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
            loss_batch = loss.eval(feed_dict={X: X_batch, y: y_batch})
            accuracy_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            loss_list.append(loss_batch)
            accuracy_list.append(accuracy_batch)
        loss_val = loss.eval(feed_dict={X: X_valid, y: y_valid})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'
              .format(epoch, np.mean(loss_list), loss_val, np.mean(accuracy_list), accuracy_val))
I expected batch normalization with Chainer to reach around 98% as well, but it stays below 80%. Am I using BatchNormalization in Chainer the wrong way, or does the structure of batch normalization differ that much between Chainer and TensorFlow?
The problem is that your Chainer model reuses a single L.BatchNormalization link (self.bn) for both hidden layers, so the two layers share one set of gamma/beta parameters and one set of running statistics. The TensorFlow code does not do this, because each call to tf.layers.batch_normalization creates its own variables. To give each layer its own batch statistics, the model definition has to look like the following code, which achieves about 98% validation accuracy after 100 epochs in my environment.
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn1 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)
            self.bn2 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn1(self.l1(x)))
        h = F.relu(self.bn2(self.l2(h)))
        return self.l3(h)
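As a quick sanity check (my own addition, assuming the trained model instance from your training loop above), you can confirm after training that the two links really track different statistics, which is exactly what the single shared bn link could not do:

import numpy as np
from chainer import cuda

# Each L.BatchNormalization link keeps its own running statistics
# (avg_mean / avg_var) and its own learned gamma / beta, so the two
# hidden layers are no longer normalized with one shared set of values.
bn1_mean = cuda.to_cpu(model.bn1.avg_mean)
bn2_mean = cuda.to_cpu(model.bn2.avg_mean)
print("bn1 running mean (first 5 units):", bn1_mean[:5])
print("bn2 running mean (first 5 units):", bn2_mean[:5])
print("max abs difference:", np.max(np.abs(bn1_mean - bn2_mean)))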