deep-learning pytorch neural-network lstm recurrent-neural-network

Test loss immediately goes up on LSTM


I'm trying to create an LSTM that predicts the outcome of team A's sixth match based on a sequence of its five previous matches. My data is structured like this: team A game 1 vs. a random team, team B game 1 vs. a random team, ..., team A game 5 vs. a random team, team B game 5 vs. a random team. Team B is the team that team A plays in the sixth game, and the result of that game is the output. Each timestep consists of 124 features, which is the combination of team A's and team B's i-th games.

My problem is that my test loss immediately goes up, and I can't ever get it to decrease consistently. I've messed around with the hyperparameters, but none of them seem to have a noticeable effect. What can I do?

import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader


def main():
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(torch.cuda.is_available())
    print(f'Running on device: {device}')

    # Process data
    data = pd.read_csv('matchup_data.csv')

    # Separate the label column from the features
    labels = data['label']

    # Remove the label column from the feature data
    data = data.drop('label', axis=1)

    num_features = 124
    samples = 531
    timesteps = 5

    # Convert features and labels to tensors
    dataT = torch.tensor(data.values).float()
    dataT = dataT.view(samples, timesteps, num_features)

    labelsT = torch.tensor(labels.values).float()
    labelsT = labelsT.unsqueeze(1)

    print(dataT)

    # Split to test and train data
    train_data, test_data, train_labels, test_labels = train_test_split(dataT, labelsT, test_size=.1)

    train_dataset = TensorDataset(train_data, train_labels)
    test_dataset = TensorDataset(test_data, test_labels)

    batch_size = 2  # Choose a batch size that fits your data and model

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # Layer parameters
    input_size = 124
    hidden_size = 64
    num_layers = 2
    output_size = 1

    # Net and net parameters
    net = LSTMnet(input_size, output_size, hidden_size, num_layers).to(device)
    print(net)
    loss_function = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.00001)

    train_accuracy, train_losses, test_accuracy, test_losses = trainModel(100, net, optimizer, loss_function,
                                                                          train_loader, test_loader, device)



    print(np.max(train_accuracy))
    print(np.min(train_losses))
    print(np.max(test_accuracy))
    print(np.min(test_losses))

    # Plot accuracy and loss
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(train_accuracy, label='Train Accuracy')
    plt.plot(test_accuracy, label='Test Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()


class LSTMnet(nn.Module):
    def __init__(self, input_size, output_size, num_hidden, num_layers):
        super().__init__()

        self.input_size = input_size
        self.num_hidden = num_hidden
        self.num_layers = num_layers

        # RNN layer
        self.lstm = nn.LSTM(input_size, num_hidden, num_layers)
        self.dropout = nn.Dropout(0.6)

        # linear layer for output
        self.out = nn.Linear(num_hidden, output_size)

    def forward(self, x):
        # Run through RNN layer
        y, hidden = self.lstm(x)

        # pass through dropout
        y = self.dropout(y)
        # Pass to linear layer
        output = self.out(y)

        return output, hidden


def trainModel(num_epochs, net, optimizer, loss_function, train_data, test_data, device):
    # Variable initialization
    train_accuracy = np.zeros(num_epochs)
    train_losses = np.zeros(num_epochs)
    test_accuracy = np.zeros(num_epochs)
    test_losses = np.zeros(num_epochs)

    for epochi in range(num_epochs):
        net.train()

        segment_loss = []
        segment_accuracy = []
        for X, y in train_data:
            X = X.to(device)
            y = y.to(device)
            output, _ = net(X)  # Unpack the tuple to get the output
            output = output[:, -1, :]
            loss = loss_function(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Convert output logits to probabilities using sigmoid
            probabilities = torch.sigmoid(output)
            # Convert probabilities to binary predictions
            predicted = (probabilities > 0.5).float()
            # Calculate accuracy
            acc = (predicted == y).float().mean() * 100

            segment_loss.append(loss.item())
            segment_accuracy.append(acc.item())

        train_losses[epochi] = np.mean(segment_loss)
        train_accuracy[epochi] = np.mean(segment_accuracy)

        net.eval()
        test_loss = []
        test_acc = []

        with torch.no_grad():
            for X, y in test_data:
                X = X.to(device)
                y = y.to(device)
                output, _ = net(X)  # Unpack the tuple to get the output
                output = output[:, -1, :]
                loss = loss_function(output, y)

                # Convert output logits to probabilities using sigmoid
                probabilities = torch.sigmoid(output)
                # Convert probabilities to binary predictions
                predicted = (probabilities > 0.5).float()
                # Calculate accuracy
                acc = (predicted == y).float().mean() * 100

                test_loss.append(loss.item())
                test_acc.append(acc.item())

            test_losses[epochi] = np.mean(test_loss)
            test_accuracy[epochi] = np.mean(test_acc)

    return train_accuracy, train_losses, test_accuracy, test_losses


if __name__ == "__main__":
    main()

[plot of the resulting training and test accuracy/loss curves]


Solution

  • My understanding is that the shape of X in net(X) is (2, 5, 62 + 62), and the shape of y in loss_function(output, y) is (2, 1). The input sequence is [A vs. rand + B vs. rand (game 1), ..., A vs. rand + B vs. rand (game 5)]. The output corresponds to [A vs. B (game 6)].
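
    A quick way to confirm which layout the LSTM is actually receiving is to print the shape of one batch straight from the DataLoader; with batch_size=2 and 5 timesteps, (2, 5, 124) means the batches are batch-first and (5, 2, 124) means they're sequence-first, which determines which indexing fix applies:

    X, y = next(iter(train_loader))
    print(X.shape, y.shape)   #e.g. torch.Size([2, 5, 124]) torch.Size([2, 1])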

    If your data is arranged (seq_length, batch_size, n_features), which I think is the case as per your comment above, then I think you'll need to change the line:

    #output = output[:, -1, :] #accesses last sample only
    output = output[-1, :, :] #corrected - access last frame from each sample
    

    Currently, it's reading out the last sample from each batch rather than reading out the last frame from each sample. As a result, it's optimising based on only 1 sample per batch, which might be causing the overfitting behaviour.

    The correction would need to be applied in both the train and test loops.
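
    Alternatively, if the batches turn out to be batch-first (i.e. (batch_size, seq_length, n_features), which is what the view(samples, timesteps, num_features) reshape plus TensorDataset would suggest), you could instead construct the LSTM with batch_first=True, in which case the original indexing is already correct. A sketch:

    #In LSTMnet.__init__, treat dim 0 as the batch dimension
    self.lstm = nn.LSTM(input_size, num_hidden, num_layers, batch_first=True)

    #...then in both loops the original readout takes the last timestep of
    #every sample, which is what you want:
    output, _ = net(X)          #output: (batch_size, seq_length, output_size)
    output = output[:, -1, :]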

    To avoid overfitting, try decreasing num_layers to 1 and hidden_size to 16 (and perhaps remove lr= from Adam to use the default). This is because the input dimensionality seems a bit large, and the number of samples might be small in relative terms. This should stop it diverging as rapidly.
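
    In code, those suggestions amount to something like this (Adam's default learning rate is 1e-3):

    #Smaller model to reduce overfitting
    hidden_size = 16
    num_layers = 1

    net = LSTMnet(input_size, output_size, hidden_size, num_layers).to(device)
    optimizer = torch.optim.Adam(net.parameters())  #default lr=1e-3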

    If that works, another thing you might like to try is using a Conv1d layer to 'compress' the 124-dimensional input down to something smaller. That should retain the important information whilst also reducing the dimensionality of the data, mitigating overfitting. Here's a simple way of modifying the existing code without needing to make any other changes:

    #Just the LSTM
    #net = LSTMnet(input_size, output_size, hidden_size, num_layers).to(device)
    
    #Conv1d to reduce feature size from 124 down to 32, followed by an LSTM
    net = nn.Sequential(
      nn.Conv1d(in_channels=124, out_channels=64, kernel_size=1),
      nn.BatchNorm1d(64),
      nn.ReLU(),
    
      nn.Conv1d(in_channels=64, out_channels=32, kernel_size=1),
      nn.BatchNorm1d(32),
      nn.ReLU(),
    
      #Finally, the LSTM
      LSTMnet(input_size=32, output_size=output_size, num_hidden=32, num_layers=2)
    ).to(device)
    

    You could start with just the first nn.Conv1d line, and add the other lines depending on whether you think it's worth it. The input and output shapes to net don't change; it's just that internally it maps the feature size down to something smaller before feeding it to an LSTM.

    About the test loss going down and then up: whilst the training loss should keep going down, the test loss may well go down and then come back up. This happens naturally and just indicates that the model has started overfitting. You'd just need to stop training before that point (if you train for long enough you'll almost always eventually see the down-then-up test loss trend). I usually track a test score (e.g. accuracy in your case) and stop training when the score hits its peak, before it starts to decline, indicating overfitting.
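
    A minimal sketch of that kind of early stopping, keeping the weights from the best-scoring epoch (the patience value and the train_one_epoch/evaluate helpers are just stand-ins for the existing loops inside trainModel):

    import copy

    best_acc, best_state, patience, bad_epochs = 0.0, None, 10, 0
    for epoch in range(num_epochs):
        train_one_epoch(net, optimizer, loss_function, train_loader, device)  #stand-in
        acc = evaluate(net, test_loader, device)                              #stand-in
        if acc > best_acc:
            best_acc, bad_epochs = acc, 0
            best_state = copy.deepcopy(net.state_dict())  #remember the best weights
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break                                     #score stopped improving
    net.load_state_dict(best_state)                       #restore the best epoch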


    In my answer above I'd originally used a kernel_size of 1, but that means the convolutional layers aren't pulling in information from other timesteps, and therefore the data fed into the LSTM isn't as useful as it could be.

    You could try increasing kernel_size, whilst at the same time setting padding='same' if you want to preserve the same sequence lengths. Playing with dilation= also allows the conv layer to see other parts of the sequence, and might be worth looking at if you have success with kernel_size.
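
    For example, drop-in replacements for the two Conv1d lines above (padding='same' needs stride=1, which is the default here):

    #Each conv output now also sees neighbouring timesteps; padding='same'
    #keeps the sequence length unchanged, and dilation widens the receptive
    #field further without adding parameters.
    nn.Conv1d(in_channels=124, out_channels=64, kernel_size=3, padding='same'),
    nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, padding='same', dilation=2),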

    Note that as long as you're only using the final output of the LSTM, you're okay to use Conv1d (where the kernel looks both back and ahead in time). Otherwise, I think you'd need to switch to a causal convolutional layer. This would be relevant if you switch to a sequence-to-sequence architecture (currently it's sequence-to-vector, but there can be convergence benefits to reframing it as sequence-to-sequence).
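
    For reference, a causal Conv1d can be built by over-padding and trimming the right-hand overhang, so each output only sees current and past timesteps; a sketch, only needed if you do move to a sequence-to-sequence setup:

    class CausalConv1d(nn.Module):
        #Conv1d whose outputs depend only on current and past timesteps
        def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
            super().__init__()
            self.trim = (kernel_size - 1) * dilation
            self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,
                                  padding=self.trim, dilation=dilation)

        def forward(self, x):  #x: (batch, channels, length)
            y = self.conv(x)
            return y[:, :, :-self.trim] if self.trim else y  #drop the future overhang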