python, pytorch, lstm

Why can't my LSTM determine whether the number of ones in a sequence is odd or even?


I am trying to understand LSTMs and wanted to implement a simple example: classify a sequence as "0" if the number of 1s in it is odd and as "1" if it is even. This is my data generation and training routine:

import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from Dataset import LSTMDataset # Custom Dataset
from Network import LSTMNet # Custom Network

if __name__ == "__main__":

    numSamples = 1000
    sampleLength = 5

    samples = np.ndarray( shape=( numSamples, sampleLength ), dtype=np.float32 )
    labels = np.ndarray( shape=( numSamples ), dtype=np.float32 )
    for s in range( numSamples ):
        sample = np.random.choice( [ 0, 1 ], size=sampleLength )
        samples[ s ] = sample
        even = np.count_nonzero( sample == 1 ) % 2 == 0
        labels[ s ] = int( even )
    
    X_train, X_test, y_train, y_test = train_test_split( samples, labels, test_size=0.25, random_state=42 )

    trainingSet = LSTMDataset( X_train, y_train )
    testSet = LSTMDataset( X_test, y_test )

    training_loader = DataLoader( trainingSet, batch_size=1, shuffle=True )
    validation_loader = DataLoader( testSet, batch_size=1, shuffle=False )
    
    model = LSTMNet( sequenceLength=sampleLength )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    loss_fn = torch.nn.BCELoss()

    for epoch in range( 10 ):
        yPredicted = []
        yTruth = []
        for i, data in enumerate( training_loader ):
            inputs, labels = data

            optimizer.zero_grad()

            outputs = model(inputs)

            loss = loss_fn(outputs, labels)
            loss.backward()

            optimizer.step()

            yTruth.append( int( labels.item() ) )
            yPredicted.append( int( torch.round( outputs ).item() ) )

        accuracy = accuracy_score( yTruth, yPredicted )
        print( f"Accuracy: {accuracy:.2f}" )

My dataset and network:

import torch.nn as nn
from torch.utils.data import Dataset

class LSTMDataset( Dataset ):
    def __init__( self, x, y ):
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[ 0 ]

    def __getitem__(self, idx):
        sample, label = self.x[ idx ], self.y[ idx ]
        return sample.reshape( ( -1, 1 ) ), label.reshape( ( 1 ) )


class LSTMNet( nn.Module ):
    def __init__( self, sequenceLength ):
        super().__init__()
        self.hidden_size = 10
        self.lstm = nn.LSTM( input_size=1, hidden_size=self.hidden_size, num_layers=2, batch_first=True )
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.ReLU(),
            nn.Linear( sequenceLength * self.hidden_size, 1 ),
            nn.Sigmoid()
        )

    def forward(self, x):
        x, _ = self.lstm( x )
        x = self.net( x )
        return x

But unfortunately, my training accuracy never goes beyond 53%. Does anyone have any tips on what I am doing wrong?

The input shape to my network is ( 1, 5, 1 ). I wanted to feed the sequence elements to the network one after another, which is why I chose ( 1, 5, 1 ) and not ( 1, 1, 5 ).
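
For completeness, a minimal standalone check of the batch_first shape convention I am relying on: with batch_first=True, nn.LSTM expects input of shape ( batch, seq_len, input_size ), so ( 1, 5, 1 ) is five time steps of a 1-dimensional feature.

import torch
import torch.nn as nn

lstm = nn.LSTM( input_size=1, hidden_size=10, num_layers=2, batch_first=True )
x = torch.zeros( 1, 5, 1 )  # ( batch, seq_len, input_size )
out, _ = lstm( x )
print( out.shape )  # torch.Size([1, 5, 10]) -- one hidden vector per time step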


Solution

  • You're feeding a bunch of 0 values directly into the network, and any value multiplied by 0 is 0, so the zeros kill the signal as it propagates through the model. Replace the raw inputs with a learned embedding:

    class LSTMNet( nn.Module ):
        def __init__( self, sequenceLength ):
            super().__init__()
            self.hidden_size = 10
            
            # added embedding layer
            self.embedding = nn.Embedding(2, self.hidden_size)
            self.lstm = nn.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size, 
                                num_layers=1, batch_first=True )
            self.net = nn.Sequential(
                nn.Flatten(),
                # added layer here, see note
                nn.Linear( sequenceLength * self.hidden_size, sequenceLength * self.hidden_size ),
                nn.ReLU(),
                nn.Linear( sequenceLength * self.hidden_size, 1 ),
                nn.Sigmoid()
            )
    
        def forward(self, x):
            # remove unit axis so x is size (batch_size, sequence_length)
            # convert to long type for embedding
            x = self.embedding(x.squeeze(-1).long())
            x, _ = self.lstm( x )
            x = self.net( x )
            return x
    

    The model has an added embedding layer. I also added another linear layer in the sequential section. Strictly speaking it is optional, but it greatly improves convergence. The output of the LSTM comes from a tanh function, which means about half of your values are below 0. Going LSTM -> ReLU throws those values away. The model can compensate, but it will learn faster with a linear layer between the LSTM and the ReLU.
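
    You can see the tanh bound directly. A quick standalone sketch (untrained LSTM, random inputs, just to illustrate the point):

    import torch
    import torch.nn as nn

    lstm = nn.LSTM( input_size=1, hidden_size=10, batch_first=True )
    x = torch.randn( 100, 5, 1 )
    out, _ = lstm( x )
    print( out.min().item(), out.max().item() )  # bounded in (-1, 1) by the tanh
    print( ( out < 0 ).float().mean().item() )   # roughly half the values are negative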

    Full code:

    import torch
    import torch.nn as nn
    import numpy as np
    from torch.utils.data import DataLoader, Dataset
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    
    class LSTMDataset( Dataset ):
        def __init__( self, x, y ):
            self.x = x
            self.y = y
    
        def __len__(self):
            return self.y.shape[ 0 ]
    
        def __getitem__(self, idx):
            sample, label = self.x[ idx ], self.y[ idx ]
            return sample.reshape( ( -1, 1 ) ), label.reshape( ( 1 ) )
    
    class LSTMNet( nn.Module ):
        def __init__( self, sequenceLength ):
            super().__init__()
            self.hidden_size = 10
            
            # added embedding layer
            self.embedding = nn.Embedding(2, self.hidden_size)
            self.lstm = nn.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size, 
                                num_layers=1, batch_first=True )
            self.net = nn.Sequential(
                nn.Flatten(),
                # added layer here, see note
                nn.Linear( sequenceLength * self.hidden_size, sequenceLength * self.hidden_size ),
                nn.ReLU(),
                nn.Linear( sequenceLength * self.hidden_size, 1 ),
                nn.Sigmoid()
            )
    
        def forward(self, x):
            # remove unit axis so x is size (batch_size, sequence_length)
            # convert to long type for embedding
            x = self.embedding(x.squeeze(-1).long())
            x, _ = self.lstm( x )
            x = self.net( x )
            return x
    
    numSamples = 1000
    sampleLength = 5
    
    samples = np.ndarray( shape=( numSamples, sampleLength ), dtype=np.float32 )
    labels = np.ndarray( shape=( numSamples ), dtype=np.float32 )
    for s in range( numSamples ):
        sample = np.random.choice( [ 0, 1 ], size=sampleLength )
        samples[ s ] = sample
        even = np.count_nonzero( sample == 1 ) % 2 == 0
        labels[ s ] = int( even )
    
    X_train, X_test, y_train, y_test = train_test_split( samples, labels, test_size=0.25, random_state=42 )
    
    trainingSet = LSTMDataset( X_train, y_train )
    testSet = LSTMDataset( X_test, y_test )
    
    # note you should use a larger batch size
    training_loader = DataLoader( trainingSet, batch_size=1, shuffle=True )
    validation_loader = DataLoader( testSet, batch_size=1, shuffle=False )
    
    model = LSTMNet( sampleLength )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    loss_fn = torch.nn.BCELoss()
    
    for epoch in range( 20 ):
        yPredicted = []
        yTruth = []
        for i, data in enumerate( training_loader ):
            inputs, labels = data
    
            optimizer.zero_grad()
    
            outputs = model(inputs)
    
            loss = loss_fn(outputs, labels)
            loss.backward()
    
            optimizer.step()
    
        yTruth.append( labels.detach() )
            yPredicted.append( torch.round( outputs.detach() ) )
    
        accuracy = accuracy_score( torch.cat(yTruth), torch.cat(yPredicted) )
        print( f"Accuracy: {accuracy:.2f}" )
    

    All that said, your model hard-codes the sequence length (the Flatten + Linear head only works for one fixed length). In this scenario, it doesn't really make sense to use an LSTM to begin with. LSTMs are meant for variable-length sequence tasks. If you have a hard-coded sequence length, you can just use an MLP (a length-independent LSTM variant is sketched after it):

    class MLPNet( nn.Module ):
        def __init__( self, sequenceLength ):
            super().__init__()
            self.net = nn.Sequential(
                nn.Flatten(),
                nn.Linear( sequenceLength, sequenceLength ),
                nn.ReLU(),
                nn.Linear( sequenceLength, 1 ),
                nn.Sigmoid()
            )
    
        def forward(self, x):
            x = self.net( x )
            return x
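
    If you do want to keep the LSTM but drop the fixed-length assumption, one variant (a sketch along the same lines as the embedding model above, untested here) is to classify from the final hidden state instead of flattening all time steps:

    class LastStateLSTM( nn.Module ):
        def __init__( self, hidden_size=10 ):
            super().__init__()
            self.embedding = nn.Embedding( 2, hidden_size )
            self.lstm = nn.LSTM( input_size=hidden_size, hidden_size=hidden_size, batch_first=True )
            self.head = nn.Sequential( nn.Linear( hidden_size, 1 ), nn.Sigmoid() )

        def forward(self, x):
            x = self.embedding( x.squeeze(-1).long() )
            _, ( h_n, _ ) = self.lstm( x )  # h_n has shape (num_layers, batch, hidden_size)
            return self.head( h_n[ -1 ] )   # classify from the last layer's final hidden state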