python, pytorch, generative-adversarial-network, dcgan

How can I fix this "expected CPU (got CUDA)" error in PyTorch?


I've been struggling to find what's wrong with my code. I'm trying to implement the DCGAN paper, and for the past 2 days I've been running into these errors. Could anyone please help me fix this?

I'm training this on Google Colab with a GPU runtime, but I'm getting this error. Yesterday I implemented the first GAN paper by Ian Goodfellow and did not get this error. I don't know what's happening; any help would be appreciated. Also, please check whether gen_input is correct or not.

I already asked this question before and got no answer; one person suggested swapping some lines around, but that gives the same error. Please help me. Here is the code:

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torchvision.utils import save_image
import torch.optim as optim

lr = 0.00002 #learning rate
nc = 3 #color channels
nz = 100 #size of latent vector or size of generator input
ngf = 64 #size of feature maps in generator
ndf = 64 #size of feature maps in discriminator
height = 128 #height of the image
width = 128 #width of the image
num_epochs = 5 #the variable name tells everything
workers = 2 #number of workers to load the data in batches
batch_size = 64 #batch size
image_size = 128 #resizing parameter
root = './simpsons/' #path to the training directory
beta1 = 0.5

img_shape = (nc, height, width)

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.convt1 = nn.ConvTranspose2d(in_channels = nz, out_channels = ngf*8, kernel_size = 4, stride = 1, padding = 0, bias = False)
        self.convt2 = nn.ConvTranspose2d(in_channels = ngf*8, out_channels = ngf*4, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.convt3 = nn.ConvTranspose2d(in_channels = ngf*4, out_channels = ngf*2, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.convt4 = nn.ConvTranspose2d(in_channels = ngf*2, out_channels = ngf, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.convt5 = nn.ConvTranspose2d(in_channels = ngf, out_channels = 3, kernel_size=4, stride = 2, padding = 1, bias = False)

    def forward(self, t):
        t = self.convt1(t)
        t = nn.BatchNorm2d(t)
        t = F.relu(t)

        t = self.convt2(t)
        t = nn.BatchNorm2d(t)
        t = F.relu(t)

        t = self.convt3(t)
        t = nn.BatchNorm2d(t)
        t = F.relu(t)

        t = self.convt4(t)
        t = nn.BatchNorm2d(t)
        t = F.relu(t)

        t = self.convt5(t)
        t = F.tanh(t)

        return t

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = ndf, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.conv2 = nn.Conv2d(in_channels = ndf, out_channels = ndf*2, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.conv3 = nn.Conv2d(in_channels = ndf*2, out_channels = ndf*4, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.conv4 = nn.Conv2d(in_channels = ndf*4, out_channels = ndf*8, kernel_size = 4, stride = 2, padding = 1, bias = False)
        self.conv5 = nn.Conv2d(in_channels = ndf*8, out_channels = 1, kernel_size = 4, stride = 1, padding = 0, bias = False)

    def forward(self, t):
        t = self.conv1(t)
        t = F.leaky_relu(t, 0.2)

        t = self.conv2(t)
        t = nn.BatchNorm2d(t)
        t = F.leaky_relu(t, 0.2)

        t = self.conv3(t)
        t = nn.BatchNorm2d(t)
        t = F.leaky_relu(t, 0.2)

        t = self.conv4(t)
        t = nn.BatchNorm2d(t)
        t = F.leaky_relu(t, 0.2)

        t = self.conv5(t)
        t = F.sigmoid(t)

        return t

def weights_init(m):
    classname = m.__class__.__name__ #returns the class name(eg: Conv2d or ConvTranspose2d)
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02) #0.0 is mean and 0.02 is standard deviation
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1, 0.02) #1 is mean and 0.02 is standard deviation
        nn.init.constant_(m.bias.data, 0.0)

def load_data(image_size, root):
    transform = transforms.Compose([
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize((0.486, 0.486, 0.486), (0.486, 0.486, 0.486))
        ])

    train_set = torchvision.datasets.ImageFolder(root = root, transform = transform)

    return train_set

#getting the batches of data
train_set = load_data(image_size, root)
dataloader = torch.utils.data.DataLoader(train_set, batch_size = batch_size, shuffle = True, num_workers = workers)

generator = Generator()
discriminator = Discriminator()

generator.apply(weights_init)
discriminator.apply(weights_init)

print(generator)
print(discriminator)

criterion = nn.BCELoss()

noise = torch.randn(64, nz, 1, 1)

optimizer_G = optim.Adam(generator.parameters(), lr = lr, betas=(beta1, 0.999))
optimizer_D = optim.Adam(discriminator.parameters(), lr = lr, betas=(beta1, 0.999))

if torch.cuda.is_available():
    print("CUDA available")
    generator = generator.to('cuda')
    discriminator = discriminator.to('cuda')
    criterion = criterion.cuda('cuda')
    Tensor = torch.cuda.FloatTensor
    print("Networks moved on to cuda")


for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(dataloader):
   
        val = Tensor(images.size(0), 1).fill_(1.0)
        fake = Tensor(images.size(0),1).fill_(0.0)
        
        real_images = images
        
        optimizer_G.zero_grad()
        
        gen_input = Tensor(np.random.normal(0,1,(512,100,4,4)))
        gen = generator(gen_input)
        
        g_loss = loss_func(discriminator(gen), val)
        
        g_loss.backward()
        optimizer_G.step()
        
        optimizer_D.zero_grad()
        
        real_loss = loss_func(discriminator(real_images), val)
        fake_loss = loss_func(discriminator(gen.detach()),fake)
        d_loss = (real_loss + fake_loss)/2
        
        d_loss.backward()
        optimizer_D.step()
        
        if i%900 == 0:
            print("[EPOCH %d/%d] [Batch %d/%d] [D loss: %f] [G loss: %f]"%(epoch, num_epochs, i, len(dataset), d_loss.item(), g_loss.item()))
        
        total_batch = epoch * len(dataset) + i
        if total_batch%400 == 0:
            save_image(gen.data[:25], 'output/%d.png' % total_batch, nrow=5)

And here's the error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-0af32f223344> in <module>()
     18         gen_input = gen_input.cuda()
     19         #we then pass it to generator()
---> 20         gen = generator(gen_input) #this returns a image
     21 
     22         #now calculate the loss wrt to discriminator output

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/batchnorm.py in __init__(self, num_features, eps, momentum, affine, track_running_stats)
     40         self.track_running_stats = track_running_stats
     41         if self.affine:
---> 42             self.weight = Parameter(torch.Tensor(num_features))
     43             self.bias = Parameter(torch.Tensor(num_features))
     44         else:

TypeError: expected CPU (got CUDA)

Any help would be appreciated. Thank you!


Solution

  • You are using nn.BatchNorm2d incorrectly.
    BatchNorm is a layer, just like Conv2d: it has internal parameters and buffers.
    Therefore, you must define these layers in the __init__ of your generator/discriminator.
    Right now you construct the layer inside your forward pass, which is wrong in several ways:
    nn.BatchNorm2d(t) passes your activation tensor t where the constructor expects num_features
    (an integer), and since t is a CUDA tensor, the constructor's internal torch.Tensor(num_features)
    call fails with "expected CPU (got CUDA)", exactly as the traceback shows. Even if it did not
    crash, a layer created inside forward would be re-initialized on every call, so its parameters
    would never be registered with the model, moved to the GPU, or updated by the optimizer.
    A corrected sketch is shown below.
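Here is a minimal sketch of how the Generator could be restructured so that each BatchNorm2d is created once in __init__ and only applied in forward (the bn1 to bn4 names are illustrative, not from the original post; the same pattern applies to the Discriminator). Note also that, with convt1 defined as ConvTranspose2d(nz, ngf*8, 4, 1, 0), the generator expects a latent input of shape (batch_size, nz, 1, 1), like the noise = torch.randn(64, nz, 1, 1) tensor already defined in the question, rather than the (512, 100, 4, 4) array used for gen_input.

import torch
import torch.nn as nn
import torch.nn.functional as F

nz = 100   #size of the latent vector (same as in the question)
ngf = 64   #size of feature maps in the generator

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.convt1 = nn.ConvTranspose2d(nz, ngf*8, kernel_size=4, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(ngf*8)   #num_features = channels produced by convt1
        self.convt2 = nn.ConvTranspose2d(ngf*8, ngf*4, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(ngf*4)
        self.convt3 = nn.ConvTranspose2d(ngf*4, ngf*2, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(ngf*2)
        self.convt4 = nn.ConvTranspose2d(ngf*2, ngf, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn4 = nn.BatchNorm2d(ngf)
        self.convt5 = nn.ConvTranspose2d(ngf, 3, kernel_size=4, stride=2, padding=1, bias=False)

    def forward(self, t):
        #each BatchNorm layer is now a registered submodule, not rebuilt per call
        t = F.relu(self.bn1(self.convt1(t)))
        t = F.relu(self.bn2(self.convt2(t)))
        t = F.relu(self.bn3(self.convt3(t)))
        t = F.relu(self.bn4(self.convt4(t)))
        return torch.tanh(self.convt5(t))

#Because all layers are registered in __init__, generator.to(device) moves every
#parameter (including the BatchNorm weights) to the GPU, and the optimizer sees them all.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
generator = Generator().to(device)

#The first layer expects a 1x1 spatial input, so the latent vector should be
#shaped (batch_size, nz, 1, 1):
noise = torch.randn(64, nz, 1, 1, device=device)
fake_images = generator(noise)   #output shape (64, 3, 64, 64) for this architecture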