Tags: tensorflow, keras, tf.keras, tpu

Using pretrained embeddings on TPU


I got this error when training a model on a TPU that uses TF Hub pretrained embeddings via hub.KerasLayer:


Epoch 1/10
---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
<ipython-input-7-e9085b1a50d7> in <cell line: 1>()
----> 1 history = model.fit(train_ds,batch_size=BATCH_SIZE,steps_per_epoch=train_steps,epochs=10,validation_data=valid_ds,validation_steps=valid_steps)

1 frames
/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py in error_handler(*args, **kwargs)
     68             # To get the full stack trace, call:
     69             # `tf.debugging.disable_traceback_filtering()`
---> 70             raise e.with_traceback(filtered_tb) from None
     71         finally:
     72             del filtered_tb

/usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
   1107       return self._numpy_internal()
   1108     except core._NotOkStatusException as e:  # pylint: disable=protected-access
-> 1109       raise core._status_to_exception(e) from None  # pylint: disable=protected-access
   1110 
   1111   @property

InternalError: RET_CHECK failure (third_party/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc:2008) arg_shape.handle_type != DT_INVALID  input edge: [id=6646 Func/while/body/_1/input/_1330:0 -> while/cluster_while_body_146058:634]

My full code:

!pip3 install -q -U tensorflow-text
from IPython.display import clear_output
import tensorflow as tf
import numpy as np
from google.colab import auth
auth.authenticate_user()
import os
import tensorflow_datasets as tfds
import tensorflow_hub as hub
from tensorflow import keras
import tensorflow_text as text

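# Connect to the Colab TPU worker and set up a TPUStrategy for distributed training.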
tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://"+os.environ["COLAB_TPU_ADDR"])
tf.config.experimental_connect_to_cluster(tpu_resolver)
tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
strategy = tf.distribute.TPUStrategy(tpu_resolver)

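# Load the IMDB reviews train/test splits as (text, label) pairs, reading from GCS.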
(train_raw, valid_raw),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    try_gcs=True,
    with_info=True
)


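# Global batch size: 16 examples per core across the 8 TPU cores.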
BATCH_SIZE = 16 * 8
train_size = ds_info.splits['train'].num_examples # 25000
valid_size = ds_info.splits['test'].num_examples # 25000
train_steps = train_size // BATCH_SIZE
valid_steps = valid_size // BATCH_SIZE


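# Input pipeline: shuffle/repeat/batch the raw (string, label) pairs.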
train_ds = train_raw.shuffle(8000)
train_ds = train_ds.repeat()
train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
valid_ds = valid_raw.batch(BATCH_SIZE, drop_remainder=True)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

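# Build and compile the model under the strategy scope. Note that the tf.string
# input and the preprocessing layer live INSIDE the model here, so the string
# ops get placed on the TPU -- this is what triggers the RET_CHECK error.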
with strategy.scope():
    load_locally = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")
    inp_ = keras.layers.Input(shape=[],dtype=tf.string)
    z = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2",load_options=load_locally)(inp_)
    z = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base-br/1",trainable=True,load_options=load_locally)(z)
    z = keras.layers.Lambda(lambda z: z['default'])(z)
    z = keras.layers.Flatten()(z)
    z = keras.layers.Dense(64,"relu")(z)
    out_ = keras.layers.Dense(1,"sigmoid")(z)
    model = keras.models.Model(inputs=[inp_],outputs=[out_])
    model.compile(loss="binary_crossentropy", optimizer="nadam",metrics=["accuracy"],steps_per_execution=20)

model.fit(train_ds,steps_per_epoch=train_steps,epochs=10,validation_data=valid_ds,validation_steps=valid_steps)

I scoured Stack Overflow for the same problem and came across one post that said it was solved by changing steps_per_epoch. I tried both decreasing and increasing the steps, but I got the same error again and again.


Solution

  • Found the answer: TPUs are unable to process tf.string tensors, so the text preprocessing has to run on the host CPU, outside the model — for example in the tf.data input pipeline — while the model itself (now taking only numeric inputs) is still built inside the strategy scope.
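Here is a minimal sketch of that fix, assuming this preprocessor emits the standard BERT-style features (input_word_ids, input_mask, input_type_ids) at its default sequence length of 128 — check the module page if yours differs. The preprocessing hub.KerasLayer is applied in the tf.data pipeline via map(), so the tf.string work runs on the host CPU, and the model built inside strategy.scope() only ever sees numeric tensors:

load_locally = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")

# Create the preprocessor OUTSIDE the model so it runs on the host CPU.
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2",
    load_options=load_locally)

def preprocess(text, label):
    # tf.string -> dict of int32 tensors, executed on CPU in the input pipeline.
    return preprocessor(text), label

train_ds = (train_raw.shuffle(8000)
            .repeat()
            .batch(BATCH_SIZE, drop_remainder=True)
            .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE))
valid_ds = (valid_raw.batch(BATCH_SIZE, drop_remainder=True)
            .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE))

with strategy.scope():
    # Assumed feature names and sequence length from the BERT-style preprocessor.
    seq_length = 128
    encoder_inputs = {
        name: keras.layers.Input(shape=[seq_length], dtype=tf.int32, name=name)
        for name in ("input_word_ids", "input_mask", "input_type_ids")
    }
    encoder = hub.KerasLayer(
        "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base-br/1",
        trainable=True, load_options=load_locally)
    z = encoder(encoder_inputs)["default"]  # pooled sentence embedding
    z = keras.layers.Dense(64, "relu")(z)
    out_ = keras.layers.Dense(1, "sigmoid")(z)
    model = keras.models.Model(inputs=encoder_inputs, outputs=out_)
    model.compile(loss="binary_crossentropy", optimizer="nadam",
                  metrics=["accuracy"], steps_per_execution=20)

model.fit(train_ds, steps_per_epoch=train_steps, epochs=10,
          validation_data=valid_ds, validation_steps=valid_steps)

This is the same pattern TensorFlow's BERT-on-TPU tutorials use: keep string ops in tf.data on the host, and feed the TPU only numeric features.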