Tags: python, deep-learning, pytorch, autoencoder, autograd

How to implement contractive autoencoder in Pytorch?


I'm trying to create a contractive autoencoder in PyTorch. I found this thread and followed its suggestions. This is the snippet I wrote based on it:

import datetime
import numpy as np 
import torch
import torchvision
from torchvision import datasets, transforms
from torchvision.utils import save_image, make_grid
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
import matplotlib.pyplot as plt 
%matplotlib inline

# Loader settings shared by the training and the test pipeline.
batch_size = 128
num_workers = 2

# MNIST splits rooted at 'MNIST' (download=True); every sample goes through
# ToTensor().
dataset_train = datasets.MNIST('MNIST', train=True, download=True,
                               transform=transforms.ToTensor())
dataset_test = datasets.MNIST('MNIST', train=False, download=True,
                              transform=transforms.ToTensor())

# Only the training loader shuffles; the test loader keeps the dataset order.
dataloader_train = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)
dataloader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)

def view_images(imgs, labels, rows=4, cols=11):
    """Show a batch of images in a ``rows`` x ``cols`` grid, titled by label.

    Args:
        imgs: 4-D tensor of images; the axis permutation below assumes a
            channel-first layout (N, C, H, W) -- TODO confirm for other data.
        labels: tensor indexed in parallel with ``imgs``; ``labels[i].item()``
            becomes the title of image ``i``.
        rows: number of grid rows.
        cols: number of grid columns.

    At most ``rows * cols`` images are drawn. Extra images in the batch are
    skipped, because requesting a subplot position beyond the grid capacity
    makes ``add_subplot`` raise.
    """
    # Move the second axis last so imshow receives (H, W, C) arrays.
    imgs = imgs.detach().cpu().numpy().transpose(0, 2, 3, 1)
    fig = plt.figure(figsize=(8, 4))
    n_shown = min(imgs.shape[0], rows * cols)
    for i in range(n_shown):
        ax = fig.add_subplot(rows, cols, i + 1, xticks=[], yticks=[])
        # squeeze() drops size-1 axes so single-channel images plot as 2-D.
        ax.imshow(imgs[i].squeeze(), cmap='Greys_r')
        ax.set_title(labels[i].item())


# Preview the first training batch in a 13 x 10 grid.
imgs, labels = next(iter(dataloader_train))
view_images(imgs, labels, rows=13, cols=10)

class Contractive_AutoEncoder(nn.Module):
    """One-hidden-layer autoencoder that also returns its hidden code.

    The hidden activations are returned next to the reconstruction so that a
    caller can build an extra penalty on them (e.g. a contractive term).

    Args:
        input_dim: number of features per flattened sample (default 784).
        hidden_dim: width of the hidden code (default 512).
    """

    def __init__(self, input_dim=784, hidden_dim=512):
        super().__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, input):
        """Encode and decode a batch.

        Args:
            input: tensor whose first axis is the batch; all remaining axes
                are flattened into ``input_dim`` features.

        Returns:
            ``(output_e, output)``: the ReLU hidden code of shape
            ``(batch, hidden_dim)`` and the sigmoid reconstruction reshaped
            back to ``input.shape``.
        """
        shape = input.shape
        # flatten() copies only when needed, so non-contiguous inputs
        # (e.g. transposed images) work; .view() would raise on those.
        flat = input.flatten(start_dim=1)
        output_e = F.relu(self.encoder(flat))
        output = torch.sigmoid(self.decoder(output_e))
        return output_e, output.view(shape)

def loss_function(output_e, outputs, imgs, device):
    """Return the reconstruction MSE plus the mean squared input gradient.

    An all-ones gradient is back-propagated through ``output_e`` first, and
    the result is then read from ``imgs.grad``. If ``imgs.grad`` is still
    ``None`` after that backward pass, the attribute assignment below raises
    ``AttributeError``.
    """
    ones = torch.ones(output_e.size()).to(device)
    output_e.backward(ones, retain_graph=True)

    assert outputs.shape == imgs.shape ,f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'

    imgs.grad.requires_grad = True
    reconstruction = nn.MSELoss()(outputs, imgs)
    print(imgs.grad)
    penalty = torch.mean(pow(imgs.grad, 2))
    return reconstruction + penalty

# Training configuration for the single-module autoencoder.
epochs = 50
interval = 2000  # number of batches between two progress outputs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Contractive_AutoEncoder().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        # The model returns both the hidden code and the reconstruction;
        # loss_function consumes both together with the input batch.
        outputs_e, outputs = model(imgs)
        loss = loss_function(outputs_e, outputs, imgs, device)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Fire only on every `interval`-th batch. The bare `i % interval`
        # test was truthy on every batch that is NOT a multiple of interval.
        if i % interval == 0:
            print('')

    # Reports the loss of the last batch of the epoch.
    print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')

For the sake of brevity I just used one layer for the encoder and the decoder. It should work regardless of number of layers in either of them obviously!

But the catch is that, aside from not knowing whether this is the correct way of doing it (calculating gradients with respect to the input), I get an error that makes the approach above unusable.

That is:

imgs.grad.requires_grad = True

produces the error :

AttributeError : 'NoneType' object has no attribute 'requires_grad'

I also tried the second method suggested in that thread which is as follows:

class Contractive_Encoder(nn.Module):
    """Single linear layer followed by ReLU, applied to flattened samples.

    Args:
        input_dim: number of features per flattened sample (default 784).
        hidden_dim: width of the produced code (default 512).
    """

    def __init__(self, input_dim=784, hidden_dim=512):
        super().__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)

    def forward(self, input):
        """Return the ReLU code of shape ``(batch, hidden_dim)``.

        All axes after the first (batch) axis are flattened into features.
        """
        # flatten() copies only when needed, so non-contiguous inputs
        # (e.g. transposed images) work; .view() would raise on those.
        flat = input.flatten(start_dim=1)
        return F.relu(self.encoder(flat))

class Contractive_Decoder(nn.Module):
    """Linear 512 -> 784 layer with a sigmoid, reshaped to (-1, 1, 28, 28)."""

    def __init__(self):
        super().__init__()
        self.decoder = nn.Linear(512, 784)

    def forward(self, input):
        """Decode a batch of 512-feature codes into 1x28x28 outputs."""
        logits = self.decoder(input)
        return F.sigmoid(logits).view(-1, 1, 28, 28)


# Second attempt: encoder and decoder are separate modules trained with two
# optimizers -- a reconstruction step, then a step on the input gradient.
epochs = 50 
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_enc = Contractive_Encoder().to(device)
model_dec = Contractive_Decoder().to(device)

# Covers the parameters of both modules; stepped after the reconstruction loss.
optimizer = optim.Adam([{"params":model_enc.parameters()},
                        {"params":model_dec.parameters()}], lr =0.001)

# Covers the encoder parameters only; stepped after the gradient-based loss.
optimizer_cond = optim.Adam(model_enc.parameters(), lr = 0.001)

criterion = nn.MSELoss()

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        # Step 1: reconstruction update through encoder and decoder.
        outputs_e = model_enc(imgs)
        outputs = model_dec(outputs_e)
        loss_rec = criterion(outputs, imgs)
        optimizer.zero_grad()
        loss_rec.backward()
        optimizer.step()

        # Step 2: track gradients on the input so backward() can fill imgs.grad.
        imgs.requires_grad_(True)
        y = model_enc(imgs)
        optimizer_cond.zero_grad()
        # NOTE(review): y is the encoder output (512 features per sample), but
        # the gradient passed here is shaped like the flattened images (784
        # features per sample); this mismatch matches the RuntimeError quoted
        # after this snippet. The tensor is also created without .to(device).
        y.backward(torch.ones(imgs.view(-1,28*28).size()))

        imgs.grad.requires_grad = True
        # NOTE(review): torch.mean is handed a Python list here, not a tensor.
        loss = torch.mean([pow(imgs.grad,2)])
        optimizer_cond.zero_grad()
        loss.backward()
        optimizer_cond.step()
        
        # NOTE(review): `i%interval` is truthy for every i that is NOT a
        # multiple of interval, so this prints on almost every batch.
        if i%interval: 
            print('')

    # Reports the loss of the last batch of the epoch.
    print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')

but I face the error :

RuntimeError: invalid gradient at index 0 - got [128, 784] but expected shape compatible with [128, 512]

How should I go about this in Pytorch?


Solution

  • Summary

    The final implementation for contractive loss that I wrote is as follows:

    def loss_function(output_e, outputs, imgs, lamda = 1e-4, device=torch.device('cuda')):
        """Return the reconstruction MSE plus ``lamda`` times an input-gradient norm.

        Args:
            output_e: hidden code produced from ``imgs``; must be connected to
                ``imgs`` in the autograd graph.
            outputs: reconstruction, same shape as ``imgs``.
            imgs: input batch; must have ``requires_grad=True`` before the
                forward pass that produced ``output_e``.
            lamda: weight of the gradient-norm penalty.
            device: kept for backward compatibility and no longer used; the
                all-ones gradient is created on ``output_e``'s own device.

        Returns:
            Scalar loss tensor that is differentiable with respect to the
            model parameters through both terms.

        Raises:
            AssertionError: if ``outputs`` and ``imgs`` differ in shape.
            RuntimeError: if ``imgs`` does not require gradients.
        """
        assert outputs.shape == imgs.shape ,f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'
        if not imgs.requires_grad:
            raise RuntimeError(
                'imgs must have requires_grad=True before the forward pass; '
                'call imgs.requires_grad_(True) first')

        loss1 = nn.MSELoss()(outputs, imgs)

        # Gradient of output_e.sum() with respect to imgs, i.e. the Jacobian
        # rows summed over the hidden units (not the full per-sample
        # Jacobian). create_graph=True keeps this gradient in the graph, so
        # the penalty actually trains the encoder; reading imgs.grad after a
        # plain backward() yields a constant. autograd.grad also leaves the
        # .grad fields of imgs and of the model parameters untouched.
        ones = torch.ones(output_e.size(), dtype=output_e.dtype, device=output_e.device)
        input_grad, = torch.autograd.grad(output_e, imgs, grad_outputs=ones,
                                          create_graph=True)

        # Frobenius norm: square root of the sum of the squared entries.
        # NOTE: sqrt has an unbounded derivative at exactly 0 (an all-zero
        # gradient); not expected for ordinary batches.
        loss2 = torch.sqrt(torch.sum(torch.pow(input_grad, 2)))
        return loss1 + (lamda * loss2)
    

    and inside training loop you need to do:

    # Weight of the contractive term; same value as loss_function's default
    # `lamda`. Defined here because the loop below reads `lam`.
    lam = 1e-4

    for e in range(epochs):
        for i, (imgs, labels) in enumerate(dataloader_train):
            imgs = imgs.to(device)
            labels = labels.to(device)

            # requires_grad_ has to come first: retain_grad() raises a
            # RuntimeError on a tensor that does not require gradients.
            imgs.requires_grad_(True)
            imgs.retain_grad()

            outputs_e, outputs = model(imgs)
            loss = loss_function(outputs_e, outputs, imgs, lam, device)

            # Gradients with respect to the input are not needed for the
            # parameter update below.
            imgs.requires_grad_(False)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Reports the loss of the last batch of the epoch.
        print(f'epoch/epochs: {e}/{epochs} loss: {loss.item():.4f}')
    

    Full explanation

    As it turns out, and as @akshayk07 rightly pointed out in the comments, the implementation found in the PyTorch forum was wrong in multiple places. Most notably, it did not implement the actual contractive loss introduced in the paper "Contractive Auto-Encoders: Explicit Invariance During Feature Extraction". Aside from that, the implementation would not work at all, for reasons that will be explained in a moment.

    The changes are obvious so I try to explain what's going on here. First of all note that imgs is not a leaf node, so the gradients would not be retained in the image .grad attribute.

    In order to retain gradients for non-leaf nodes, you should use retain_grad(); by default, .grad is only populated for leaf tensors. Also, imgs.retain_grad() should be called before doing forward(), as it instructs autograd to store gradients for non-leaf nodes.

    Update

    Thanks to @Michael for pointing out that the correct calculation of Frobenius Norm is actually (from ScienceDirect):

    the square root of the sum of the squares of all the matrix entries

    and not

    the square root of the sum of the absolute values of all the matrix entries as explained here