tensorflow neural-network deep-learning recurrent-neural-network handwriting-recognition

TensorFlow: No decrease in CTC loss while training BLSTM

I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.

I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step

I am new to TensorFlow so could have made some naive error. The code used:

class CRNN(object):

def __init__(self, config):

    self.config = config
    tf.reset_default_graph()

def read_and_decode(self, filename_queue):

    reader = tf.TFRecordReader()

    _, serialized_example = reader.read(filename_queue)

    # Define how to parse the example
    context_features = {
        'length': tf.FixedLenFeature([], dtype=tf.int64),
        'out_length': tf.FixedLenFeature([], dtype=tf.int64)
    }
    sequence_features = {
        'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
        'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
    }

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)

    image = sequence_parsed['token']
    label = tf.cast(sequence_parsed['labels'], tf.int32)
    length = tf.cast(context_parsed['length'], tf.int32)
    lab_length = tf.cast(context_parsed['out_length'], tf.int32)

    image_shape = tf.cast(tf.stack([self.config.im_height, 
                                    length/self.config.im_height]), tf.int32)
    image = tf.reshape(image, image_shape)

    # Updating length to represent image width
    length = tf.shape(image)[1]

    # Batch the variable length tensor with dynamic padding
    self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
        tensors=[image, label, length, lab_length],
        batch_size=self.config.batch_size, dynamic_pad=True)

def net(self):


    batch_lab_length = tf.reduce_max(self.lab_lengths)
    batch_im_length = tf.reduce_max(self.lengths)

    # Reshape to time major
    sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
                                            self.config.im_height])

    # Feed sequences into RNN
    with tf.name_scope('RNN'):
        self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
                                       state_is_tuple=True)
        self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
                                       state_is_tuple=True)
        self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=self.cell_fw,
            cell_bw=self.cell_bw,
            inputs=sequences,
            dtype=tf.float32,
            sequence_length=self.lengths,
            time_major=True,
            scope='RNN'
        )

        # Reshaping to apply the same weights over the timesteps
        self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])

        self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
                                                 self.config.num_classes],
                                                stddev=0.1), name='out_W')
        self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')

        # Doing the affine projection
        logits = tf.matmul(self.output, self.out_W) + self.out_b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    # Training computation

    # Prepare sparse tensor for CTC loss
    labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
    sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))

    labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
    mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
    labels_vals = tf.boolean_mask(labels_vals,mask)

    labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals, 
                                    dense_shape=[self.config.batch_size, 
                                                 tf.cast(batch_lab_length, tf.int64)])
    self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths, 
                          preprocess_collapse_repeated=False, ctc_merge_repeated=False, 
                          time_major=True)
    self.cost = tf.reduce_mean(self.loss)

    # Optimizer
    self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
                                           momentum=0.9, use_nesterov=True).minimize(self.cost)

    # Predictions for the training, validation, and test data.
    self.train_prediction = tf.nn.ctc_beam_search_decoder(logits, 
                                                sequence_length=self.lab_lengths)


def train(self):
    num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
    tf.reset_default_graph()

    filename_queue = tf.train.string_input_producer(
                    [self.config.tfrecord_filename], num_epochs=self.config.num_epochs)

    self.read_and_decode(filename_queue)
    self.net()

    # The op for initializing the variables.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    saver = tf.train.Saver()

    with tf.Session() as sess:

        training_summary = tf.summary.scalar("training_cost", self.cost)
        writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)

        sess.run(init_op)
        print('Initialized')
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        start = time.time()
        steps_time = start

        epoch = 1
        for step in range(num_steps):
            _, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
                                                                     self.train_prediction, 
                                                                     self.labels, training_summary])
            writer.add_summary(train_summ, step) 


            if (step % 10000 == 0):
                preds = np.zeros((predictions[0][0].dense_shape))
                i =  0
                for idx in predictions[0][0].indices:
                    preds[idx[0]][idx[1]] = predictions[0][0].values[i]
                    i+=1
                print(time.time() - steps_time)
                steps_time = time.time()
                print('Minibatch cost at step %d: %f' % (step, c))
                print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels], 
                      'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])

            if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
                print('Epoch', epoch, 'Completed')
                epoch+=1

            last_step = step
        saver.save(sess, "model_BLSTM", global_step=last_step)
        writer.close()
        print(time.time() - start)

Solution

After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'

More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time

Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.