deep-learning neural-network pytorch softmax

Why does Softmax(dim=0) produce poor results?


I'm getting weird results from a PyTorch Softmax layer and trying to figure out what's going on, so I boiled it down to a minimal test case: a neural network that just learns to decode binary numbers into one-hot form.

Just Softmax() gets a warning:

UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.

Okay, so what to supply for X? I had been guessing 0 would be a sensible argument. Just to make sure, I tried Softmax(dim=1):

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

Okay, so that seems clear about the allowed values. -1 apparently means the last dimension, so in this case, where the output is just a one-dimensional vector, that should mean the same thing as 0. Trying it with Softmax(dim=-1) works fine; in a few thousand epochs, the network reliably learns to decode the numbers with 100% accuracy.
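A quick sanity check (my own test, just to confirm the equivalence for the one-dimensional case):

import torch

v = torch.randn(32)  # a single un-batched output vector
print(torch.allclose(torch.softmax(v, dim=0), torch.softmax(v, dim=-1)))  # True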

Just to make sure it gives the same results, I tried it again with Softmax(dim=0) (as shown below)...

And it does not give the same result at all. The accuracy oscillates, but levels off somewhere around 20-30%.

What's going on? Why is 0 not the same as -1 in this context, and what exactly is 0 doing?

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


bits = 5


class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))

            y = []
            for j in range(1 << bits):
                y.append(float(i == j))

            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]


trainDs = Dataset1()

batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize)

for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break

hiddenSize = 100


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(bits, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.Tanh(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, 1 << bits),
            nn.Softmax(dim=0),
        )

    def forward(self, x):
        return self.layers(x)


device = torch.device("cpu")
model = Net().to(device)


def accuracy(model, ds):
    n = 0
    for x, y in ds:
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)


criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)

        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")

Solution

  • In the accuracy function, you forgot to create a new dimension for the batch (batch size 1), so the model there receives a one-dimensional tensor; that is why dim=1 is out of range in that context. During training, however, the DataLoader delivers batches of shape (batchSize, 1 << bits), so Softmax(dim=0) normalizes across the batch (each class column sums to 1 over the 16 examples) instead of across the classes of each example, which is why the network can't learn. Regarding the dimension of the softmax, you can check this post.
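    To see the difference, here's a minimal sketch (shapes chosen to match the training batches above; the variable names are just for illustration):

    import torch

    logits = torch.randn(16, 32)  # a batch of 16 outputs, 32 classes each

    per_example = torch.softmax(logits, dim=1)  # each row sums to 1
    per_column = torch.softmax(logits, dim=0)   # each column sums to 1, coupling the examples

    print(per_example.sum(dim=1))  # 16 ones: one distribution per example
    print(per_column.sum(dim=0))   # 32 ones: normalized across the batch instead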

    Below is the modified code.

    import torch
    from torch import nn
    from torch.utils.data import Dataset, DataLoader
    
    
    bits = 5
    
    
    class Dataset1(Dataset):
        def __init__(self):
            s = []
            for i in range(1 << bits):
                x = []
                for c in format(i, "b").zfill(bits):
                    x.append(float(c == "1"))
    
                y = []
                for j in range(1 << bits):
                    y.append(float(i == j))
    
                x = torch.as_tensor(x)
                y = torch.as_tensor(y)
                s.append((x, y))
            self.s = s
    
        def __len__(self):
            return len(self.s)
    
        def __getitem__(self, i):
            return self.s[i]
    
    
    trainDs = Dataset1()
    
    batchSize = 16
    trainDl = DataLoader(trainDs, batch_size=batchSize, drop_last=True)
    
    for x, y in trainDl:
        print(x.shape)
        print(y.shape)
        break
    
    hiddenSize = 100
    
    
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.layers = nn.ModuleList([
                nn.Linear(bits, hiddenSize),
                nn.ReLU(),
                nn.Linear(hiddenSize, hiddenSize),
                nn.Tanh(),
                nn.Linear(hiddenSize, hiddenSize),
                nn.ReLU(),
                nn.Linear(hiddenSize, 1 << bits),
                nn.Softmax(dim=1),  # normalize over the class dimension, not the batch
            ])
    
        def forward(self, x):
            # iterating layer by layer (instead of nn.Sequential) makes it easy
            # to inspect intermediate shapes while debugging, e.g. print(x.shape)
            for layer in self.layers:
                x = layer(x)
            return x
    
    
    device = torch.device("cpu")
    model = Net().to(device)
    
    
    def accuracy(model, ds):
        n = 0
        for x, y in ds:
            x = x.unsqueeze(0)  # create a batch of size 1
            y = y.unsqueeze(0)  # create a batch of size 1
            with torch.no_grad():
                z = model(x)
            if torch.argmax(y) == torch.argmax(z):
                n += 1
        return n / len(ds)
    
    
    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    epochs = 10000
    interval = epochs // 10
    for epoch in range(epochs + 1):
        for bi, (x, y) in enumerate(trainDl):
            x = x.to(device)
            y = y.to(device)
    
            loss = criterion(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            if epoch % interval == 0 and not bi:
                print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")