The dataset is CIFAR10. I've created a VGG-like network:
class FirstModel(nn.Module):
def __init__(self):
super(FirstModel, self).__init__()
self.vgg1 = nn.Sequential(
nn.Conv2d(3, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.Conv2d(16, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg2 = nn.Sequential(
nn.Conv2d(16, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(32, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg3 = nn.Sequential(
nn.Conv2d(32, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(64, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.fc1 = nn.Linear(4 * 4 * 64, 4096)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(4096, 4096)
self.fc3 = nn.Linear(4096, 10)
self.softmax = nn.Softmax()
self.dropout = nn.Dropout(0.5)
def forward(self, x):
x = self.vgg3(self.vgg2(self.vgg1(x)))
x = nn.Flatten()(x)
x = self.relu(self.fc1(x))
x = self.dropout(x)
x = self.relu(self.fc2(x))
x = self.dropout(x)
x = self.softmax(self.fc3(x))
return x
Then I train it and visualize loss and accuracy:
import matplotlib.pyplot as plt
from IPython.display import clear_output
def plot_history(train_history, val_history, title='loss'):
plt.figure()
plt.title('{}'.format(title))
plt.plot(train_history, label='train', zorder=1)
points = np.array(val_history)
steps = list(range(0, len(train_history) + 1, int(len(train_history) / len(val_history))))[1:]
plt.scatter(steps, val_history, marker='*', s=180, c='red', label='val', zorder=2)
plt.xlabel('train steps')
plt.legend(loc='best')
plt.grid()
plt.show()
def train_model(model, optimizer, train_dataloader, test_dataloader):
criterion = nn.CrossEntropyLoss()
train_loss_log = []
train_acc_log = []
val_loss_log = []
val_acc_log = []
for epoch in range(NUM_EPOCH):
model.train()
train_loss = 0.
train_size = 0
train_acc = 0.
for inputs, labels in train_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
y_pred = model(inputs)
loss = criterion(y_pred, labels)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_size += y_pred.size(0)
train_loss_log.append(loss.data / y_pred.size(0))
_, pred_classes = torch.max(y_pred, 1)
train_acc += (pred_classes == labels).sum().item()
train_acc_log.append(np.mean((pred_classes == labels).cpu().numpy()))
# блок validation
val_loss = 0.
val_size = 0
val_acc = 0.
model.eval()
with torch.no_grad():
for inputs, labels in test_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
y_pred = model(inputs)
loss = criterion(y_pred, labels)
val_loss += loss.item()
val_size += y_pred.size(0)
_, pred_classes = torch.max(y_pred, 1)
val_acc += (pred_classes == labels).sum().item()
val_loss_log.append(val_loss/val_size)
val_acc_log.append(val_acc/val_size)
clear_output()
plot_history(train_loss_log, val_loss_log, 'loss')
plot_history(train_acc_log, val_acc_log, 'accuracy')
print('Train loss:', train_loss / train_size)
print('Train acc:', train_acc / train_size)
print('Val loss:', val_loss / val_size)
print('Val acc:', val_acc / val_size)
Then I train the model:
first_model = FirstModel()
first_model.to(device)
optimizer = optim.RMSprop(first_model.parameters(), lr=0.001, momentum=0.9)
train_model(first_model_rms, optimizer, train_dataloader, test_dataloader)
The loss and accuracy do not change (accuracy at level of 0.1). However, if the optimizer is SGD with momentum everything works fine (loss and accuracy change). I've already tried to change momentum and lr, but it does not help.
What should be fixed? Would be grateful for any possible advice!
So first of all, you don't have to use softmax in the "model" as it is done by the nn.CrossEntropyLoss
, and I also think that the RMSprop doesn't work with momentum.