I'm getting weird results from a PyTorch Softmax layer. To figure out what's going on, I boiled it down to a minimal test case: a neural network that just learns to decode binary numbers into one-hot form.
Just Softmax() gets a warning:
UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Okay, so what to supply for X? I had been guessing 0 would be a sensible argument. Just to make sure, I tried Softmax(dim=1):
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
Okay, so that seems clear about allowed values. -1 apparently means the last dimension, so in this case, where the output is just a one-dimensional vector, that should mean the same thing as 0. Trying it with Softmax(dim=-1) works fine; in a few thousand epochs, the network reliably learns to decode the numbers with 100% accuracy.
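As a quick sanity check of that reasoning (a standalone snippet, separate from the training code below, on a plain 1-D tensor with no batch dimension):

import torch

v = torch.tensor([1.0, 2.0, 3.0])   # a single 1-D vector, no batch dimension
print(torch.softmax(v, dim=0))      # tensor([0.0900, 0.2447, 0.6652])
print(torch.softmax(v, dim=-1))     # identical: dim=-1 is the last (and only) dimension

So on a bare vector the two really do seem interchangeable.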
Just to make sure it gives the same results, I tried it again with Softmax(dim=0) (as shown below)...
And it does not give the same result at all. The accuracy oscillates, but levels off somewhere around 20-30%.
What's going on? Why is 0 not the same as -1 in this context, and what exactly is 0 doing?
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
bits = 5
class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))
            y = []
            for j in range(1 << bits):
                y.append(float(i == j))
            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]
trainDs = Dataset1()
batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize)
for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break
hiddenSize = 100
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(bits, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.Tanh(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, 1 << bits),
            nn.Softmax(dim=0),
        )

    def forward(self, x):
        return self.layers(x)
device = torch.device("cpu")
model = Net().to(device)
def accuracy(model, ds):
    n = 0
    for x, y in ds:
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")
In the accuracy function, you forgot to add a batch dimension (batch size 1) to the input: the model is fed a bare one-dimensional vector there, which is why you get that IndexError when you use dim=1. Regarding what the dim argument of softmax does, you can check this post; there is also a short sketch below.
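In short, softmax normalizes along the dimension you give it. Inside the training loop the model output has shape (batchSize, 32), so dim=0 makes the probabilities sum to 1 down each column, i.e. across the samples in the batch, while dim=1 makes each row (one sample's 32 class scores) sum to 1, which is what you want here. A small standalone sketch (the tensor values are just made up for illustration):

import torch

# A toy batch of 2 samples with 3 "class scores" each.
logits = torch.tensor([[1.0, 2.0, 3.0],
                       [3.0, 2.0, 1.0]])

p0 = torch.softmax(logits, dim=0)   # normalizes each COLUMN: couples samples in the batch
p1 = torch.softmax(logits, dim=1)   # normalizes each ROW: one distribution per sample

print(p0.sum(dim=0))   # tensor([1., 1., 1.])  -- sums to 1 across the batch
print(p1.sum(dim=1))   # tensor([1., 1.])      -- sums to 1 across the classes

That also explains the training behaviour you saw: with dim=0 the predicted probabilities for one sample depend on the other samples in the same batch, so the network cannot learn a clean per-sample distribution. dim=-1 works because for a 2-D output it is the same as dim=1.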
Below is the modified code.
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
bits = 5
class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))
            y = []
            for j in range(1 << bits):
                y.append(float(i == j))
            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]
trainDs = Dataset1()
batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize, drop_last=True)
for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break
hiddenSize = 100
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(bits, hiddenSize),
             nn.ReLU(),
             nn.Linear(hiddenSize, hiddenSize),
             nn.Tanh(),
             nn.Linear(hiddenSize, hiddenSize),
             nn.ReLU(),
             nn.Linear(hiddenSize, 1 << bits),
             nn.Softmax(dim=1)]
        )

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = layer(x)
            # Uncomment to inspect the tensor going into / coming out of the softmax:
            # if i == 6:
            #     print('softmax input shape', x.shape)
            #     print('softmax output shape', torch.nn.functional.softmax(x, dim=1).shape)
        return x
device = torch.device("cpu")
model = Net().to(device)
def accuracy(model, ds):
    n = 0
    for x, y in ds:
        x = x.unsqueeze(0)  # create a batch of size 1
        y = y.unsqueeze(0)  # create a batch of size 1
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")
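If you want to convince yourself the modified model now produces one probability distribution per sample, a quick check along these lines (just a sanity check, not part of the training) should show every row of the output summing to 1:

# With Softmax(dim=1), each row of the output is a distribution over the 32 classes.
x, y = next(iter(trainDl))
with torch.no_grad():
    z = model(x)
print(z.shape)        # torch.Size([16, 32])
print(z.sum(dim=1))   # all values ~1.0, one per sample in the batch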