I am creating Google Colabs for each talk I found interesting from the Tensorflow 2020 Summit. As a note, I am using Tensorflow 2.1.
I have encountered a problem when attempting to implement the 'Learning To Read With Tensorflow' talk.
Everything is peachy up until we get to the `EncoderDecoder` class definition. When I call the `fit` method on my custom `Model` subclass, I get an error, which is detailed below.
The last salient error is `AttributeError: 'NoneType' object has no attribute 'dtype'`.
However, I believe this is due to a problem within the `GradientTape` scope code and/or problems with the definition of the decoder layers (including the attention layers).
# Not normally defined here... but doing so for clarity.
# Default for the `max_features` argument of EncoderDecoder.
MAX_VOCAB_SIZE = 5000
# EncoderDecoder's default `output_seq_len` is WINDOW_LENGTH - 1.
WINDOW_LENGTH = 11
class EncoderDecoder(tf.keras.Model):
    """Encoder / attention / decoder model with a custom training step.

    The model owns its own `TextVectorization` layer, so `train_step`
    receives raw string tensors and vectorizes them itself.

    The projection layer is a `Dense` layer without an activation, so the
    predictions handed to the compiled loss and metrics are logits; compile
    the model with a loss that expects logits.

    NOTE: Keras only routes `fit` through an overridden `train_step` (and only
    provides `compiled_loss` / `compiled_metrics`) from TensorFlow 2.2
    onwards. Under TensorFlow 2.1 `fit` never reaches `train_step`.

    Args:
        max_features: Vocabulary size used for the vectorizer, the embedding
            tables, the one-hot labels and the projection layer.
        output_seq_len: Length every vectorized string is padded/truncated to.
        embedding_dims: Width of the encoder and decoder embeddings.
        rnn_units: Number of units in the encoder LSTM and the decoder cell.
    """

    def __init__(self,
                 max_features=MAX_VOCAB_SIZE,
                 output_seq_len=WINDOW_LENGTH-1,
                 embedding_dims=200,
                 rnn_units=512):
        super().__init__()
        self.max_features = max_features
        self.output_seq_len = output_seq_len
        self.embedding_dims = embedding_dims
        self.rnn_units = rnn_units

        # Turns raw strings into fixed-length sequences of integer token ids.
        self.vectorize_layer = \
            tf.keras.layers.experimental.preprocessing.TextVectorization(
                max_tokens=self.max_features,
                standardize='lower_and_strip_punctuation',
                split='whitespace',
                ngrams=None,
                output_mode='int',
                output_sequence_length=self.output_seq_len,
                pad_to_max_tokens=True)

        # --- <ENCODER STUFF> ---
        # Embedding
        self.encoder_embedding = \
            tf.keras.layers.Embedding(input_dim=self.max_features+1,
                                      output_dim=self.embedding_dims)
        # ENCODER
        # return_state=True makes the layer return the hidden and cell
        # states alongside its output.
        self.lstm_layer = \
            tf.keras.layers.LSTM(units=self.rnn_units,
                                 return_state=True)
        # --- </ENCODER STUFF> ---

        # --- <DECODER STUFF> ---
        # Embedding
        self.decoder_embedding = \
            tf.keras.layers.Embedding(input_dim=self.max_features+1,
                                      output_dim=self.embedding_dims)
        # Sampler (for use during training) and the recurrent cell that the
        # BasicDecoder steps through; neither was shown during the talk.
        sampler = tfa.seq2seq.sampler.TrainingSampler()
        decoder_cell = tf.keras.layers.LSTMCell(units=self.rnn_units)
        # Output layer for the decoder (no activation, i.e. it emits logits).
        self.projection_layer = \
            tf.keras.layers.Dense(self.max_features)
        # DECODER
        self.decoder = \
            tfa.seq2seq.BasicDecoder(cell=decoder_cell,
                                     sampler=sampler,
                                     output_layer=self.projection_layer)
        # --- </DECODER STUFF> ---

        # --- <ATTN STUFF> ---
        # Basic dense attention layer to connect Encoder & Decoder
        self.attention = tf.keras.layers.Attention()
        # --- </ATTN STUFF> ---

    def train_step(self, data):
        """Run one optimization step on a batch of raw-text examples.

        Args:
            data (tuple): The example (ten `words`) and the label (one `word`).

        Returns:
            dict: Metric name -> current result for every metric in
            `self.metrics`.
        """
        # Split data into example (x) and label (y)
        x, y = data[0], data[1]

        # Vectorize the example words (x)
        x = self.vectorize_layer(x)

        # Vectorize the labels. The layer pads every label out to
        # `output_seq_len`, but only the first entry is the true label.
        # Slice with 0:1 (not 0) so the length-1 axis is kept: the decoder
        # embedding of `y` must be rank 3 to be concatenated with the
        # expanded attention output below.
        y = self.vectorize_layer(y)[:, 0:1]

        # Convert the label into a one-hot encoding over the vocabulary.
        y_one_hot = tf.one_hot(y, self.max_features)

        # Everything within GradientTape is recorded
        # for later automatic differentiation
        with tf.GradientTape() as tape:
            # --- <ENCODER STUFF> ---
            # Transform the example utilizing the encoder embedding
            inputs = self.encoder_embedding(x)
            # The LSTM is built with return_state=True and the default
            # return_sequences=False, so it returns its last output plus
            #   * state_h --- The Hidden State
            #   * state_c --- The Cell State
            encoder_outputs, state_h, state_c = self.lstm_layer(inputs)
            # --- </ENCODER STUFF> ---

            # --- <ATTN STUFF> ---
            # Attend with the encoder output as query and the hidden state
            # as value, then add a length-1 axis so the result lines up with
            # the single decoder step.
            # NOTE(review): both attention inputs are rank 2 here — confirm
            # this is the intended use of the Attention layer.
            attn_output = self.attention([encoder_outputs, state_h])
            attn_output = tf.expand_dims(attn_output, axis=1)
            # --- </ATTN STUFF> ---

            # --- <DECODER STUFF> ---
            # Embed an all-zeros token as the (empty) decoder input.
            targets = self.decoder_embedding(tf.zeros_like(y))
            # Concat the output of the attention layer to the last axis
            # of the empty targets embedding
            concat_output = tf.concat([targets, attn_output], axis=-1)
            # Predict the targets using the state from the encoder
            outputs, _, _ = \
                self.decoder(concat_output, initial_state=[state_h, state_c])
            # The decoder returns a BasicDecoderOutput; `rnn_output` holds
            # the projection layer's logits.
            y_pred = outputs.rnn_output
            # --- </DECODER STUFF> ---

            # The loss must be computed inside the tape so it can be
            # differentiated; use the loss configured in `compile()`.
            loss = self.compiled_loss(
                y_one_hot,
                y_pred,
                regularization_losses=self.losses)

        # Automatically differentiate the loss w.r.t. the trainable variables
        trainable_variables = self.trainable_variables
        gradients = tape.gradient(loss, trainable_variables)

        # Apply the gradients with the optimizer configured in `compile()`
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))

        # Update the metric state prior to return
        self.compiled_metrics.update_state(y_one_hot, y_pred)
        return {m.name: m.result() for m in self.metrics}
model = EncoderDecoder()
# The model's projection layer is a Dense layer without an activation, so its
# predictions are logits; the loss has to be told so explicitly because
# CategoricalCrossentropy defaults to from_logits=False.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer="adam",
              metrics=["accuracy"])
# Build the vectorizer's vocabulary from the text lines before training.
model.vectorize_layer.adapt(lines.batch(256))
# ERROR OCCURS ON THIS LINE
model.fit(data.batch(256),
          epochs=45,
          callbacks=[tf.keras.callbacks.ModelCheckpoint(filepath='text_gen')])
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-40-779906f7f617> in <module>()
1 model.fit(data.batch(256),
2 epochs=45,
----> 3 callbacks=[tf.keras.callbacks.ModelCheckpoint(filepath='text_gen')])
8 frames
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233 max_queue_size=max_queue_size,
234 workers=workers,
--> 235 use_multiprocessing=use_multiprocessing)
236
237 total_samples = _get_total_number_of_samples(training_data_adapter)
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
591 max_queue_size=max_queue_size,
592 workers=workers,
--> 593 use_multiprocessing=use_multiprocessing)
594 val_adapter = None
595 if validation_data:
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
704 max_queue_size=max_queue_size,
705 workers=workers,
--> 706 use_multiprocessing=use_multiprocessing)
707
708 return adapter
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, standardize_function, **kwargs)
700
701 if standardize_function is not None:
--> 702 x = standardize_function(x)
703
704 # Note that the dataset instance is immutable, its fine to reusing the user
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in standardize_function(dataset)
658 model.sample_weight_mode = getattr(model, 'sample_weight_mode', None)
659
--> 660 standardize(dataset, extract_tensors_from_dataset=False)
661
662 # Then we map using only the tensor standardization portion.
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle, extract_tensors_from_dataset)
2358 is_compile_called = False
2359 if not self._is_compiled and self.optimizer:
-> 2360 self._compile_from_inputs(all_inputs, y_input, x, y)
2361 is_compile_called = True
2362
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in _compile_from_inputs(self, all_inputs, target, orig_inputs, orig_target)
2578 if training_utils.has_tensors(target):
2579 target = training_utils.cast_if_floating_dtype_and_mismatch(
-> 2580 target, self.outputs)
2581 training_utils.validate_input_types(target, orig_target,
2582 allow_dict=False, field_name='target')
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_utils.py in cast_if_floating_dtype_and_mismatch(targets, outputs)
1334 if tensor_util.is_tensor(targets):
1335 # There is one target, so output[0] should be the only output.
-> 1336 return cast_single_tensor(targets, dtype=outputs[0].dtype)
1337 new_targets = []
1338 for target, out in zip(targets, outputs):
AttributeError: 'NoneType' object has no attribute 'dtype'
`data` & `lines` Variables If Wishing To Replicate
Get the Data
>>> wget http://www.thespermwhale.com/jaseweston/babi/CBTest.tgz
>>> tar zxvf CBTest.tgz
>>> rm -rf CBTest.tgz
Preprocess The Data
# Stream the corpus, one text line per element, from one or more text files.
lines = tf.data.TextLineDataset("<path-to>/cbt_train.txt")
# Drop title lines before any cleaning happens.
# (`is_title` is a simple fn not included in this stackoverflow code.)
lines = lines.filter(lambda x: not is_title(x))
# Clean every remaining line in three passes, in this order:
#   1. remove all punctuation,
#   2. remove the extra spaces created by step 1,
#   3. turn all uppercase letters into lowercase letters.
# (These simple fns are not included in this stackoverflow code either.)
lines = (
    lines
    .map(lambda x: remove_punc(x))
    .map(lambda x: remove_extra_spaces(x))
    .map(lambda x: make_lower(x))
)
# Split each line into words, then flatten to one word per element.
words = lines.map(tf.strings.split).unbatch()
# Group consecutive words into wordsets of 11.
wordsets = words.batch(11)
# get_example_label is a simple fn that splits a wordset into an example
# (first ten words) and a label (last word); shuffle the resulting pairs.
data = wordsets.map(get_example_label).shuffle(1024)
References
Thanks in advance!!
It appears as though TensorFlow has released tutorials detailing all of the demos that were conducted at the Summit.
The result is that you can examine the actual code and determine the differences between theirs and yours. I won't post the differences here because they are more significant than I initially thought.
When I contacted TensorFlow, they also recommended that I check out the transformer tutorial, which goes into detail on how to implement complex encoder-decoder and self-attention networks.