I'm trying to use PyTorch on the IMDB dataset to classify reviews as positive or negative. When I get to the training stage, the criterion function raises the following error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))
After some research, I saw that the error occurs because the model returns a tensor of size [1136, 64, 1], while criterion expects only batch-sized results.
However, I don't know how to solve this error.
My code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
import sys
import csv
import re
import random
import nltk
from nltk.corpus import stopwords
import pandas as pd
from torchtext import vocab
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy import datasets

SEED = 1234
torch.manual_seed(SEED)  # For reproducibility
torch.backends.cudnn.deterministic = True
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        h_1 = F.relu(self.hidden_fc(embedded))
        # assert torch.equal(output[-1,:,:], h_1.squeeze(0))
        # [batch size, output dim]
        return self.fc(h_1.squeeze(0))
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))  # 0.75 --> 1, 0.4 --> 0
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # Train mode is on
    for batch in iterator:
        optimizer.zero_grad()  # Reset the gradients
        predictions = model(batch.text)  # forward propagation
        print(predictions.shape)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()  # backward propagation / calculate gradients
        optimizer.step()  # update parameters
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # Evaluation mode is on
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  lower = True)
LABEL = data.LabelField(dtype = torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) ## IMDB reviews dataset
train_data, valid_data = train_data.split(random_state = random.seed(SEED))
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)  # Build the vocabulary from the 25K most frequent words
LABEL.build_vocab(train_data)
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
model = MLP(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
To summarize your problem: you have reviews that you want to classify as positive or negative. To do so, you train an embedding space to map each word to a vector, then output a probability for each sentence, supervised with the corresponding label using a binary cross-entropy loss, nn.BCEWithLogitsLoss.
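You can reproduce the exact error outside your training loop; here is a minimal, self-contained sketch (the tensors are random placeholders, only the shapes matter):

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
logits = torch.randn(1136, 64, 1)  # what your model currently returns: (seq_len, batch_size, 1)
labels = torch.rand(64)            # what the iterator provides: one label per sentence
criterion(logits, labels)          # raises: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))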
Your current model is comprised of:

- nn.Embedding: embeds each word in the sequence independently, converting the input tensor shape from (seq_len, batch_size) to (seq_len, batch_size, embedding_dim), where seq_len is the number of tokens in your input sequence.
- A first nn.Linear layer that projects the features, converting the tensor shape from (seq_len, batch_size, embedding_dim) to (seq_len, batch_size, hidden_dim).
- A non-linearity applied to the sequence of word vectors. Note how the structure of the sentence is retained.
- A second nn.Linear layer that maps from (seq_len, batch_size, hidden_dim) to (seq_len, batch_size, output_dim), still with the sentence structure (cf. dim=0 with seq_len).

This is why you are getting (1136, 64, 1) as the predictions shape: 1136 must be your sequence length, 64 is BATCH_SIZE, and 1 is OUTPUT_DIM.
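You can verify this shape propagation with your current MLP class and dummy input (the numbers below are placeholders; the exact vocabulary size doesn't matter for the shape check):

model = MLP(input_dim=25_002, embedding_dim=100, hidden_dim=256, output_dim=1)
dummy_text = torch.randint(0, 25_002, (1136, 64))  # (seq_len, batch_size)
print(model(dummy_text).shape)                     # torch.Size([1136, 64, 1])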
Yet you are trying to classify each sequence as a whole; what you need instead is a single scalar value per sentence, i.e. a shape of (1, 64, 1). This means reducing the first dimension, which corresponds to the sequence dimension, to a single value.
A straightforward way to reduce the dimension so that the whole sentence is represented by a single vector is to apply an average pool over the sentence. The average of the word vectors in each sentence should capture how positive or negative the overall sentence is. You can apply this operator before the final projection so as to remain in a relatively high dimension, either with nn.AdaptiveAvgPool1d with an output size of 1, or simply with torch.Tensor.mean.
Here is a possible implementation with nn.AdaptiveAvgPool1d:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.avg = nn.AdaptiveAvgPool1d(1)  # reduces (N, C, L_in) to (N, C, 1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text: (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # embedded: (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # h_1: (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = self.avg(h_1.permute(1, 2, 0))
        # avg: (batch_size, hidden_dim, 1) = (64, 256, 1)
        out = self.fc(avg.squeeze(-1))
        # out: (batch_size, output_dim) = (64, 1)
        return out
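If you instantiate this version with your dimensions, the dummy check from above now yields a batch-sized output (same placeholder numbers as before):

model = MLP(input_dim=25_002, embedding_dim=100, hidden_dim=256, output_dim=1)
dummy_text = torch.randint(0, 25_002, (1136, 64))  # (seq_len, batch_size)
print(model(dummy_text).shape)                     # torch.Size([64, 1])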
Or with torch.Tensor.mean:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text: (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # embedded: (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # h_1: (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = h_1.mean(0)
        # avg: (batch_size, hidden_dim) = (64, 256)
        out = self.fc(avg)
        # out: (batch_size, output_dim) = (64, 1)
        return out
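Both variants compute the same thing, an average over the sequence dimension, so either one works. In both cases the model now returns a tensor of shape (batch_size, 1), so in the train loop you would squeeze the trailing dimension before passing the predictions to criterion, exactly as your evaluate function already does (a small sketch mirroring your existing loop):

predictions = model(batch.text).squeeze(1)  # (batch_size,) matches batch.label
loss = criterion(predictions, batch.label)  # BCEWithLogitsLoss now sees matching sizes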
Alternative methods involve using more sophisticated neural network layers such as recurrent neural network blocks (nn.RNN, nn.LSTM, nn.GRU)...
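For instance, a recurrent variant could encode the sequence with an nn.LSTM and classify from its final hidden state. The sketch below is only illustrative of that direction (the class name LSTMClassifier is made up for this example, not something from your code or torchtext):

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)  # expects (seq_len, batch_size, embedding_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text: (seq_len, batch_size)
        embedded = self.embedding(text)
        # embedded: (seq_len, batch_size, embedding_dim)
        _, (hidden, _) = self.lstm(embedded)
        # hidden: (1, batch_size, hidden_dim) -- final hidden state of the single layer
        return self.fc(hidden.squeeze(0))
        # (batch_size, output_dim)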