I have a neural network that takes data from a .txt file and uses NLP to learn how to speak like a human. But whenever I import Tokenizer and pad_sequences (both of which are needed), they do not import correctly.
I believe there may be a problem with my TensorFlow version or configuration, but I have updated it to the latest version. I may end up needing to test my code in a fresh virtual machine to get it working.
Here is my code:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import importlib
if importlib.util.find_spec("tensorflow.keras.preprocessing.text.Tokenizer") is not None:
print("The Tokenizer class has been imported successfully.")
else:
print("The Tokenizer class has not been imported successfully.")
if importlib.util.find_spec("tensorflow.keras.preprocessing.text.pad_sequences") is not None:
print("The pad_sequences class has been imported successfully.")
else:
print("The pad_sequences class has not been imported successfully.")
# Load the text dataset. The encoding is stated explicitly so the file is
# decoded the same way on every platform instead of using the locale default.
with open('data.txt', 'r', encoding='utf-8') as f:
    data = f.read()
# Split the data into sentences. Whitespace inside each piece is collapsed to
# single spaces because a Tokenizer created with filters='' separates words on
# the space character only; without this, words separated by a line break or
# a tab would be fused into a single token. Blank pieces are dropped.
sentences = [' '.join(piece.split()) for piece in data.split('.')]
sentences = [sentence for sentence in sentences if sentence]
# Create a tokenizer and fit on the sentences (filters='' keeps punctuation
# attached to the words instead of stripping it)
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(sentences)
# Convert the sentences to sequences of integers
sequences = tokenizer.texts_to_sequences(sentences)
# Create input and target sequences: every proper prefix of a sentence is an
# input, and the token that follows that prefix is its target.
input_sequences = []
target_sequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        input_sequences.append(sequence[:i])
        target_sequences.append(sequence[i])
# A sentence with fewer than two tokens produces no pairs. If that is true of
# every sentence, stop here with a clear message instead of the opaque
# "max() arg is an empty sequence" error further down.
if not input_sequences:
    raise ValueError(
        "No training pairs could be built: every sentence has fewer than "
        "two known tokens."
    )
# Pad the input sequences to the length of the longest prefix
max_sequence_length = max(len(prefix) for prefix in input_sequences)
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length)
# Convert the target sequences to one-hot vectors; the +1 leaves room for
# index 0, which the tokenizer never assigns to a word
one_hot_target_sequences = tf.keras.utils.to_categorical(
    target_sequences,
    num_classes=len(tokenizer.word_index) + 1,
)
# Assemble the network layer by layer: token embedding -> LSTM -> softmax
# over the vocabulary (word_index size plus one for the unused index 0)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 100))
model.add(tf.keras.layers.LSTM(128))
model.add(
    tf.keras.layers.Dense(len(tokenizer.word_index) + 1, activation='softmax')
)
# Configure the optimiser and loss, then fit on the prepared prefix/target data
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    padded_input_sequences,
    one_hot_target_sequences,
    epochs=10,
)
# Generate text
def generate_text(model, tokenizer, max_sequence_length, start_text):
    """Extend *start_text* greedily, one predicted token at a time.

    Args:
        model: Trained model whose ``predict`` returns, for a batch of padded
            token sequences, one probability vector per sequence.
        tokenizer: Fitted tokenizer used to encode *start_text* and to decode
            the generated token ids.
        max_sequence_length: Length the input is padded to; generation also
            stops once the token list reaches this length.
        start_text: Seed text to continue.

    Returns:
        The seed tokens plus the generated tokens, decoded back to a string.
    """
    # Create a sequence of tokens from the seed text
    tokens = tokenizer.texts_to_sequences([start_text])[0]
    # A standalone '.' is not guaranteed to be in the vocabulary (the training
    # text is split on '.'), so look it up without raising KeyError. When it
    # is absent, only the length limit ends generation.
    end_token = tokenizer.word_index.get('.')
    # Generate until the end-of-sentence token or the length limit is reached
    while len(tokens) < max_sequence_length:
        # Pad the input sequence with the same helper used for training data
        padded_sequence = pad_sequences([tokens], maxlen=max_sequence_length)
        # Get the probability distribution for the next token
        probabilities = model.predict(padded_sequence)[0]
        # Choose the most probable token; store it as a plain int so the
        # token list does not mix Python and NumPy integers
        next_token = int(np.argmax(probabilities))
        tokens.append(next_token)
        # Stop early at the end of the sentence
        if next_token == end_token:
            break
    # Convert the tokens back to text
    return tokenizer.sequences_to_texts([tokens])[0]
# Produce a sample continuation of a fixed seed phrase and display it
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
    start_text='Hello, my name is Bard.',
)
print(generated_text)
The following worked with TensorFlow 2.12.0. You can see where the Tokenizer class and the pad_sequences function are defined in the linked GitHub blobs for TensorFlow 2.12.0. The names of these modules and the structure of TensorFlow/Keras have changed a few times, so the correct import statements are version-specific.
Change:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
To:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences