deep-learning neural-network pytorch softmax

Why does Softmax(dim=0) produce poor results?


I'm getting weird results from a PyTorch Softmax layer and trying to figure out what's going on, so I boiled it down to a minimal test case: a neural network that just learns to decode binary numbers into one-hot form.

Just Softmax() gets a warning:

UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.

Okay, so what to supply for X? I had been guessing 0 would be a sensible argument. Just to make sure, I tried Softmax(dim=1):

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

Okay, so that seems clear about the allowed values. -1 apparently means the last dimension, so in this case, where the output is just a one-dimensional vector, that should mean the same thing as 0. Trying it with Softmax(dim=-1) works fine; in a few thousand epochs, the network reliably learns to decode the numbers with 100% accuracy.
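A quick sanity check (my own test, just to confirm the equivalence for the one-dimensional case):

import torch

v = torch.randn(32)  # a single un-batched output vector
print(torch.allclose(torch.softmax(v, dim=0), torch.softmax(v, dim=-1)))  # True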

Just to make sure it gives the same results, I tried it again with Softmax(dim=0) (as shown below)...

And it does not give the same result at all. The accuracy oscillates, but levels off somewhere around 20-30%.

What's going on? Why is 0 not the same as -1 in this context, and what exactly is 0 doing?

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


bits = 5


class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))

            y = []
            for j in range(1 << bits):
                y.append(float(i == j))

            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]


trainDs = Dataset1()

batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize)

for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break

hiddenSize = 100


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(bits, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.Tanh(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, 1 << bits),
            nn.Softmax(dim=0),
        )

    def forward(self, x):
        return self.layers(x)


device = torch.device("cpu")
model = Net().to(device)


def accuracy(model, ds):
    n = 0
    for x, y in ds:
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)


criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)

        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")

Solution

  • In the accuracy function, you forgot to create a new dimension for the batch (batch size 1), so the model there receives a one-dimensional tensor; that is why dim=1 is out of range in that context. During training, however, the DataLoader delivers batches of shape (batchSize, 1 << bits), so Softmax(dim=0) normalizes across the batch (each class column sums to 1 over the 16 examples) instead of across the classes of each example, which is why the network can't learn. Regarding the dimension of the softmax, you can check this post.
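    To see the difference, here's a minimal sketch (shapes chosen to match the training batches above; the variable names are just for illustration):

    import torch

    logits = torch.randn(16, 32)  # a batch of 16 outputs, 32 classes each

    per_example = torch.softmax(logits, dim=1)  # each row sums to 1
    per_column = torch.softmax(logits, dim=0)   # each column sums to 1, coupling the examples

    print(per_example.sum(dim=1))  # 16 ones: one distribution per example
    print(per_column.sum(dim=0))   # 32 ones: normalized across the batch instead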

    Below is the modified code.

    import torch
    from torch import nn
    from torch.utils.data import Dataset, DataLoader
    
    
    bits = 5
    
    
    class Dataset1(Dataset):
        def __init__(self):
            s = []
            for i in range(1 << bits):
                x = []
                for c in format(i, "b").zfill(bits):
                    x.append(float(c == "1"))
    
                y = []
                for j in range(1 << bits):
                    y.append(float(i == j))
    
                x = torch.as_tensor(x)
                y = torch.as_tensor(y)
                s.append((x, y))
            self.s = s
    
        def __len__(self):
            return len(self.s)
    
        def __getitem__(self, i):
            return self.s[i]
    
    
    trainDs = Dataset1()
    
    batchSize = 16
    trainDl = DataLoader(trainDs, batch_size=batchSize, drop_last=True)
    
    for x, y in trainDl:
        print(x.shape)
        print(y.shape)
        break
    
    hiddenSize = 100
    
    
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.layers = nn.ModuleList([
                nn.Linear(bits, hiddenSize),
                nn.ReLU(),
                nn.Linear(hiddenSize, hiddenSize),
                nn.Tanh(),
                nn.Linear(hiddenSize, hiddenSize),
                nn.ReLU(),
                nn.Linear(hiddenSize, 1 << bits),
                nn.Softmax(dim=1),  # normalize over the class dimension, not the batch
            ])
    
        def forward(self, x):
            # iterating layer by layer (instead of nn.Sequential) makes it easy
            # to inspect intermediate shapes while debugging, e.g. print(x.shape)
            for layer in self.layers:
                x = layer(x)
            return x
    
    
    device = torch.device("cpu")
    model = Net().to(device)
    
    
    def accuracy(model, ds):
        n = 0
        for x, y in ds:
            x = x.unsqueeze(0)  # create a batch of size 1
            y = y.unsqueeze(0)  # create a batch of size 1
            with torch.no_grad():
                z = model(x)
            if torch.argmax(y) == torch.argmax(z):
                n += 1
        return n / len(ds)
    
    
    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    epochs = 10000
    interval = epochs // 10
    for epoch in range(epochs + 1):
        for bi, (x, y) in enumerate(trainDl):
            x = x.to(device)
            y = y.to(device)
    
            loss = criterion(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            if epoch % interval == 0 and not bi:
                print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")