python, python-3.x, pytorch, torchtext

Mismatching target size in criterion


I'm trying to use PyTorch on the IMDB dataset to predict whether reviews are positive or negative. When I get to the training stage, the criterion function raises the following error:

ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))

After some research, I saw that the error occurs because the model returns a tensor of size [1136, 64, 1], while the criterion expects only one value per example in the batch.
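
If I understand correctly, nn.BCEWithLogitsLoss requires the input and the target to have exactly the same shape. A minimal sketch with dummy tensors (not my real data) that reproduces the mismatch:

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
labels = torch.zeros(64)              # one label per example, like batch.label
bad_preds = torch.randn(1136, 64, 1)  # the shape my model currently returns
# criterion(bad_preds, labels)        # raises the ValueError above
good_preds = torch.randn(64)          # one logit per example
print(criterion(good_preds, labels))  # works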

However, I don't know how to solve this error.

My code:

import csv
import random
import re
import sys

import nltk
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords
from torchtext import vocab
from torchtext.legacy import data, datasets
from torchtext.legacy.data import Field

SEED = 1234

torch.manual_seed(SEED)  # For reproducibility
torch.backends.cudnn.deterministic = True

class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        #text = [sent len, batch size]
        embedded = self.embedding(text)
        
        h_1 = F.relu(self.hidden_fc(embedded))
        
        # [batch size, output dim]
        return self.fc(h_1.squeeze(0))


def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds)) # 0.75 --> 1 0.4 --> 0
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() #Train mode is on
    
    for batch in iterator:
        
        optimizer.zero_grad() #Reset the gradients
        predictions = model(batch.text) ## forward propagation
        print(predictions.shape)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward() ## backward propagation / calculate gradients
        optimizer.step() ## update parameters
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval() #Evaluation mode is on
    
    with torch.no_grad():

        for batch in iterator:
            predictions = model(batch.text).squeeze(1) 
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



TEXT = data.Field(tokenize = 'spacy', 
                  tokenizer_language = 'en_core_web_sm',
                  lower = True)
                  

LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  ## IMDB reviews dataset
train_data, valid_data = train_data.split(random_state = random.seed(SEED)) 

MAX_VOCAB_SIZE = 25_000 

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE) #Build the vocabulary using the top frequent 25K words
LABEL.build_vocab(train_data)
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE)


INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = MLP(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()


N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

Solution

  • To summarize your problem: you have reviews that you want to classify as positive or negative. To do so, you train an embedding space to map each word to a vector, then output a probability for each sentence, supervised with the corresponding label through a binary cross-entropy loss, nn.BCEWithLogitsLoss.

    Your current model consists of an nn.Embedding layer, a hidden nn.Linear layer with a ReLU activation, and a final nn.Linear projection, all applied to every token position independently.

    This is the reason why you are getting (1136, 64, 1) as the predictions shape: 1136 must be your sequence length, 64 is BATCH_SIZE, while 1 is OUTPUT_DIM.

    Yet you are trying to classify each sequence as a whole; what you need instead is a single scalar value per sentence, i.e. a shape of (1, 64, 1). This implies reducing the first dimension, which corresponds to the sequence dimension, to a single value.

    A straightforward way to reduce this dimension, so that the whole sentence is represented by a single vector, is to apply an average pool over the sequence. The average of the word vectors in each sentence should give a good signal of the overall positiveness/negativeness of the sentence. You can apply this operator before the final projection, in order to stay in a relatively high dimension, either with nn.AdaptiveAvgPool1d with an output size of 1 or simply with torch.Tensor.mean.

    Here is a possible implementation with nn.AdaptiveAvgPool1d:

    class MLP(nn.Module):
        def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
            super().__init__()
            self.embedding = nn.Embedding(input_dim, embedding_dim)
            self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
            self.avg = nn.AdaptiveAvgPool1d(1) # reduces (N, C, L_in) to (N, C, 1)
            self.fc = nn.Linear(hidden_dim, output_dim)
    
        def forward(self, text):
            # (seq_len, batch_size) = (1136, 64)
            embedded = self.embedding(text)
            # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
            h_1 = F.relu(self.hidden_fc(embedded))
            # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
            avg = self.avg(h_1.permute(1,2,0))
            # (batch_size, hidden_dim, 1) = (64, 256, 1)
            out = self.fc(avg.squeeze(-1))
            # (batch_size, output_dim) = (64, 1)
            return out 
    

    Or with torch.Tensor.mean:

    class MLP(nn.Module):
        def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
            super().__init__()
            self.embedding = nn.Embedding(input_dim, embedding_dim)
            self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
            self.fc = nn.Linear(hidden_dim, output_dim)
    
        def forward(self, text):
            # (seq_len, batch_size) = (1136, 64)
            embedded = self.embedding(text)
            # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
            h_1 = F.relu(self.hidden_fc(embedded))
            # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
            avg = h_1.mean(0)
            # (batch_size, hidden_dim) = (64, 256)
            out = self.fc(avg)
            # (batch_size, output_dim) = (64, 1)
            return out 
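
    Note that with either version the model still returns a tensor of shape (batch_size, 1) = (64, 1), while batch.label has shape (64,). So in your train loop you would also squeeze the last dimension before computing the loss, exactly as your evaluate function already does:

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)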
    

    Alternative methods involve using more sophisticated neural network layers, such as recurrent blocks (nn.RNN, nn.LSTM, nn.GRU), which can summarize the whole sequence in their final hidden state.
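
    For completeness, here is a rough sketch (untested on your data) of how an nn.LSTM variant might look, using the final hidden state as the sentence summary; the class name LSTMClassifier is just illustrative:

    class LSTMClassifier(nn.Module):
        def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
            super().__init__()
            self.embedding = nn.Embedding(input_dim, embedding_dim)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)
            self.fc = nn.Linear(hidden_dim, output_dim)

        def forward(self, text):
            # text: (seq_len, batch_size)
            embedded = self.embedding(text)
            # embedded: (seq_len, batch_size, embedding_dim)
            _, (hidden, _) = self.lstm(embedded)
            # hidden: (1, batch_size, hidden_dim), the final hidden state summarizes the sequence
            out = self.fc(hidden.squeeze(0))
            # out: (batch_size, output_dim) = (64, 1)
            return out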