Chainer's BatchNormalization does not work well in my code, although batch normalization in TensorFlow works fine. I use the MNIST dataset, as the code below shows.
With Chainer (version 6.1.0) and no batch normalization, the validation accuracy is between 0.97 and 0.98 after 100 epochs, whereas with batch normalization it stays below 0.80 after 100 epochs.
When I do the same with TensorFlow (version 1.14.0), the validation accuracy is around 0.98 in both cases, with or without batch normalization.
This is the relevant part of my code. The number of epochs is 100 and the batch size is 1000. I use Adam as the optimizer with learning rate 0.01.
dataset, train data, validation data
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
model and condition (chainer)
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.99, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)
n_epoch = 100
n_batchsize = 1000
model and condition (tensorflow)
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
All code I use with chainer
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from keras.datasets import mnist
import cupy as cp
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)
# define optimizer
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)
## learn network
n_epoch = 100
n_batchsize = 1000
iteration = 0
gpu_id = 0
cuda.get_device(gpu_id).use()
# send the network to gpu memory
model.to_gpu(gpu_id)
print("epoch train/loss val/loss train/acc val/acc")
for epoch in range(n_epoch):
    # order dataset randomly
    order = np.random.permutation(range(len(x_train)))
    loss_list = []
    accuracy_list = []
    for i in range(0, len(order), n_batchsize):
        index = order[i:i+n_batchsize]
        x_train_batch = x_train[index, :]
        y_train_batch = y_train[index]
        x_train_batch = cp.asarray(x_train_batch)
        y_train_batch = cp.asarray(y_train_batch)
        output_train = model(x_train_batch)
        loss_train_batch = F.softmax_cross_entropy(output_train, y_train_batch)
        accuracy_train_batch = F.accuracy(output_train, y_train_batch)
        loss_list.append(cuda.to_cpu(loss_train_batch.array))
        accuracy_list.append(cuda.to_cpu(accuracy_train_batch.array))
        model.cleargrads()
        loss_train_batch.backward()
        optimizer.update()
        iteration += 1
    loss_train = np.mean(loss_list)
    accuracy_train = np.mean(accuracy_list)
    # after one epoch, evaluate with validation data
    x_val = cp.asarray(x_val)
    y_val = cp.asarray(y_val)
    with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
        output_val = model(x_val)
        loss_val = F.softmax_cross_entropy(output_val, y_val)
    loss_val = cuda.to_cpu(loss_val.array)
    accuracy_val = F.accuracy(output_val, y_val)
    accuracy_val = cuda.to_cpu(accuracy_val.array)
    print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'.format(epoch, loss_train, loss_val, accuracy_train, accuracy_val))
All code I use with tensorflow
import tensorflow as tf
from keras.datasets import mnist
from functools import partial
import numpy as np
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
X_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
X_valid = test_images.astype('float32')/255
y_valid = test_labels.astype('int32')
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 100
batch_size = 1000
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print("epoch train/loss val/loss train/acc val/acc")
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        loss_list = []
        accuracy_list = []
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
            loss_batch = loss.eval(feed_dict={X: X_batch, y: y_batch})
            accuracy_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            loss_list.append(loss_batch)
            accuracy_list.append(accuracy_batch)
        loss_val = loss.eval(feed_dict={X: X_valid, y: y_valid})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'
              .format(epoch, np.mean(loss_list), loss_val, np.mean(accuracy_list), accuracy_val))
I expected batch normalization with Chainer to reach around 98% as well, but it stays below 80%. Am I using BatchNormalization in Chainer the wrong way, or does the structure of batch normalization differ that much between Chainer and TensorFlow?
The problem is that your Chainer model reuses a single L.BatchNormalization link (self.bn) for both hidden layers, so the two layers share one set of gamma/beta parameters and one set of running statistics. The TensorFlow code does not do this, because each call to tf.layers.batch_normalization creates its own variables. To give each layer its own batch statistics, the model definition has to look like the following code, which achieves about 98% validation accuracy after 100 epochs in my environment.
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn1 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)
            self.bn2 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn1(self.l1(x)))
        h = F.relu(self.bn2(self.l2(h)))
        return self.l3(h)
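As a quick sanity check (my own addition, assuming the trained model instance from your training loop above), you can confirm after training that the two links really track different statistics, which is exactly what the single shared bn link could not do:

import numpy as np
from chainer import cuda

# Each L.BatchNormalization link keeps its own running statistics
# (avg_mean / avg_var) and its own learned gamma / beta, so the two
# hidden layers are no longer normalized with one shared set of values.
bn1_mean = cuda.to_cpu(model.bn1.avg_mean)
bn2_mean = cuda.to_cpu(model.bn2.avg_mean)
print("bn1 running mean (first 5 units):", bn1_mean[:5])
print("bn2 running mean (first 5 units):", bn2_mean[:5])
print("max abs difference:", np.max(np.abs(bn1_mean - bn2_mean)))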