I got this error when training tfhub pretrained embeddings keraslayer
Epoch 1/10
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-7-e9085b1a50d7> in <cell line: 1>()
----> 1 history = model.fit(train_ds,batch_size=BATCH_SIZE,steps_per_epoch=train_steps,epochs=10,validation_data=valid_ds,validation_steps=valid_steps)
1 frames
/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py in error_handler(*args, **kwargs)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
/usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1107 return self._numpy_internal()
1108 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1109 raise core._status_to_exception(e) from None # pylint: disable=protected-access
1110
1111 @property
InternalError: RET_CHECK failure (third_party/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc:2008) arg_shape.handle_type != DT_INVALID input edge: [id=6646 Func/while/body/_1/input/_1330:0 -> while/cluster_while_body_146058:634]
My full code:
!pip3 install -q -U tensorflow-text
from IPython.display import clear_output
import tensorflow as tf
import numpy as np
from google.colab import auth
auth.authenticate_user()
import os
import tensorflow_datasets as tfds
import tensorflow_hub as hub
from tensorflow import keras
import tensorflow_text as text
import os
tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://"+os.environ["COLAB_TPU_ADDR"])
tf.config.experimental_connect_to_cluster(tpu_resolver)
tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
strategy = tf.distribute.TPUStrategy(tpu_resolver)
(train_raw, valid_raw),ds_info = tfds.load(
name="imdb_reviews",
split=["train", "test"],
as_supervised=True,
try_gcs=True,
with_info=True
)
BATCH_SIZE = 16 * 8
train_size = ds_info.splits['train'].num_examples # 25000
valid_size = ds_info.splits['test'].num_examples # 25000
train_steps = train_size // BATCH_SIZE
valid_steps = valid_size // BATCH_SIZE
train_ds = train_raw.shuffle(8000)
train_ds = train_ds.repeat()
train_ds = train_ds.batch(BATCH_SIZE,drop_remainder=True)
train_ds = train_ds.prefetch(-1)
valid_ds = valid_raw.batch(BATCH_SIZE,drop_remainder=True)
valid_ds = valid_ds.prefetch(-1)
with strategy.scope():
load_locally = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")
inp_ = keras.layers.Input(shape=[],dtype=tf.string)
z = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2",load_options=load_locally)(inp_)
z = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base-br/1",trainable=True,load_options=load_locally)(z)
z = keras.layers.Lambda(lambda z: z['default'])(z)
z = keras.layers.Flatten()(z)
z = keras.layers.Dense(64,"relu")(z)
out_ = keras.layers.Dense(1,"sigmoid")(z)
model = keras.models.Model(inputs=[inp_],outputs=[out_])
model.compile(loss="binary_crossentropy", optimizer="nadam",metrics=["accuracy"],steps_per_execution=20)
model.fit(train_ds,steps_per_epoch=train_steps,epochs=10,validation_data=valid_ds,validation_steps=valid_steps)
I had scourged stackoverflow for the same problem, I got across one post which said their problem was solved by changing the steps_per_epoch. I changed the steps, decreased and increased but I got the same thing again and again.
found the answer. TPU's are unable to process tf.strings, so you need to add the preprocessing of inputs in the cpu of tpu that is outside of model and inside of scope of strategy.