I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'
More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.