I'm using ResNet18 from Hugging Face to fine-tune on a multi-label dataset. For each image I want to predict its 3 corresponding labels, and for that I created 3 fully connected layers. First, I tried updating the classifier layer of ResNet18:
num_classes_artist = 129
num_classes_style = 27
num_classes_genre = 11

model2.classifier_artist = torch.nn.Sequential(
    torch.nn.Dropout(p=0.2, inplace=True),
    torch.nn.Linear(in_features=512, out_features=num_classes_artist, bias=True)
).to(device)

model2.classifier_style = torch.nn.Sequential(
    torch.nn.Dropout(p=0.2, inplace=True),
    torch.nn.Linear(in_features=512, out_features=num_classes_style, bias=True)
).to(device)

model2.classifier_genre = torch.nn.Sequential(
    torch.nn.Dropout(p=0.2, inplace=True),
    torch.nn.Linear(in_features=512, out_features=num_classes_genre, bias=True)
).to(device)
But that didn't work. The model architecture didn't include the 3 classifiers that I added. Here is the summary from torchinfo:
Layer (type (var_name))                                      Input Shape         Output Shape
================================================================================
ResNetForImageClassification (ResNetForImageClassification)  [32, 3, 224, 224]   [32, 1000]
├─ResNetModel (resnet)                                       [32, 3, 224, 224]   [32, 512, 1, 1]
│    └─ResNetEmbeddings (embedder)                           [32, 3, 224, 224]   [32, 64, 56, 56]
│    │    └─ResNetConvLayer (embedder)                       [32, 3, 224, 224]   [32, 64, 112, 112]
│    │    └─MaxPool2d (pooler)                               [32, 64, 112, 112]  [32, 64, 56, 56]
│    └─ResNetEncoder (encoder)                               [32, 64, 56, 56]    [32, 512, 7, 7]
│    │    └─ModuleList (stages)                              --                  --
│    └─AdaptiveAvgPool2d (pooler)                            [32, 512, 7, 7]     [32, 512, 1, 1]
├─Sequential (classifier)                                    [32, 512, 1, 1]     [32, 1000]
│    └─Flatten (0)                                           [32, 512, 1, 1]     [32, 512]
│    └─Linear (1)                                            [32, 512]           [32, 1000]
After that, I proceeded to implement the model directly in PyTorch:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WikiartModel(nn.Module):
    def __init__(self, num_artists, num_genres, num_styles):
        super(WikiartModel, self).__init__()
        # Shared convolutional layers
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Artist classification branch
        self.fc_artist1 = nn.Linear(256 * 16 * 16, 512)
        self.fc_artist2 = nn.Linear(512, num_artists)

        # Genre classification branch
        self.fc_genre1 = nn.Linear(256 * 16 * 16, 512)
        self.fc_genre2 = nn.Linear(512, num_genres)

        # Style classification branch
        self.fc_style1 = nn.Linear(256 * 16 * 16, 512)
        self.fc_style2 = nn.Linear(512, num_styles)

    def forward(self, x):
        # Shared convolutional layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 256 * 16 * 16)

        # Artist classification branch
        artists_out = F.relu(self.fc_artist1(x))
        artists_out = self.fc_artist2(artists_out)

        # Genre classification branch
        genre_out = F.relu(self.fc_genre1(x))
        genre_out = self.fc_genre2(genre_out)

        # Style classification branch
        style_out = F.relu(self.fc_style1(x))
        style_out = self.fc_style2(style_out)

        return artists_out, genre_out, style_out

# Set the number of classes for each task
num_artists = 129  # Including "Unknown Artist"
num_genres = 11    # Including "Unknown Genre"
num_styles = 27
And here is the torchinfo summary:
Layer (type (var_name))      Input Shape          Output Shape
================================================================================
WikiartModel (WikiartModel)  [32, 3, 224, 224]    [98, 129]
├─Conv2d (conv1)             [32, 3, 224, 224]    [32, 64, 224, 224]
├─MaxPool2d (pool)           [32, 64, 224, 224]   [32, 64, 112, 112]
├─Conv2d (conv2)             [32, 64, 112, 112]   [32, 128, 112, 112]
├─MaxPool2d (pool)           [32, 128, 112, 112]  [32, 128, 56, 56]
├─Conv2d (conv3)             [32, 128, 56, 56]    [32, 256, 56, 56]
├─MaxPool2d (pool)           [32, 256, 56, 56]    [32, 256, 28, 28]
├─Linear (fc_artist1)        [98, 65536]          [98, 512]
├─Linear (fc_artist2)        [98, 512]            [98, 129]
├─Linear (fc_genre1)         [98, 65536]          [98, 512]
├─Linear (fc_genre2)         [98, 512]            [98, 11]
├─Linear (fc_style1)         [98, 65536]          [98, 512]
├─Linear (fc_style2)         [98, 512]            [98, 27]
The batch size of the input data ([32, 3, 224, 224]) and the batch size of the model's output predictions ([98, 129]) don't match. I've checked my data loading, model architecture, and training loop, but I can't identify the root cause. The inconsistency raises an error when calculating the loss inside the training loop:

loss_artist = criterion_artist(outputs_artist, labels_artist)

ValueError: Expected input batch_size (98) to match target batch_size (32).
In your model architecture, you've defined three 2D convolutional layers (self.conv1, self.conv2, self.conv3) and one max-pooling layer (self.pool):

self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.pool = nn.MaxPool2d(2, 2)

With kernel_size=3 and padding=1, each convolution preserves the spatial size, while each MaxPool2d(2, 2) halves it: 224 -> 112 -> 56 -> 28. So for an input of size [32, 3, 224, 224], after passing through

x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = self.pool(F.relu(self.conv3(x)))

the size of x is [32, 256, 28, 28], not the [32, 256, 16, 16] your fully connected layers assume. That mismatch is exactly where the 98 comes from: x.view(-1, 256 * 16 * 16) keeps the total number of elements (32 * 256 * 28 * 28 = 6,422,528) and infers the first dimension, producing 6,422,528 / 65,536 = 98 rows.
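You can verify this with a small standalone shape check; the layer definitions below simply mirror the ones in your model:

import torch
import torch.nn as nn
import torch.nn.functional as F

conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
pool = nn.MaxPool2d(2, 2)

x = torch.randn(32, 3, 224, 224)
x = pool(F.relu(conv1(x)))
x = pool(F.relu(conv2(x)))
x = pool(F.relu(conv3(x)))
print(x.shape)                           # torch.Size([32, 256, 28, 28])
print(x.view(-1, 256 * 16 * 16).shape)   # torch.Size([98, 65536]): the stray 98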
Now, to feed the flattened features into the fully connected branches, you should replace the line:

x = x.view(-1, 256 * 16 * 16)

with

x = x.view(x.size(0), -1)

This flattens each sample to a vector of length 256 * 28 * 28 = 200704, giving the shape [32, 200704] and preserving the batch dimension. Consequently, you should adjust each

nn.Linear(256 * 16 * 16, 512)

to

nn.Linear(256 * 28 * 28, 512)
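As an aside, if you'd rather not hard-code the 28x28 feature-map size (it silently breaks again whenever the input resolution changes), you can pool the features to a fixed size before flattening, much like the AdaptiveAvgPool2d pooler in your ResNet18 summary. A minimal sketch; the global_pool layer and the 256-input Linear here are my suggestion, not part of your original code:

import torch
import torch.nn as nn

# Pool each 256-channel feature map down to 1x1, so the Linear input size
# no longer depends on the image resolution.
global_pool = nn.AdaptiveAvgPool2d((1, 1))
fc_artist1 = nn.Linear(256, 512)

x = torch.randn(32, 256, 28, 28)   # conv features from the shared layers
x = global_pool(x)                 # [32, 256, 1, 1]
x = x.view(x.size(0), -1)          # [32, 256]
print(fc_artist1(x).shape)         # torch.Size([32, 512])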
The revised WikiartModel class is as follows:
class WikiartModel(nn.Module):
    def __init__(self, num_artists, num_genres, num_styles):
        super(WikiartModel, self).__init__()
        # Shared convolutional layers
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.size = 28  # spatial size after three 2x2 poolings: 224 -> 112 -> 56 -> 28

        # Artist classification branch
        self.fc_artist1 = nn.Linear(256 * self.size * self.size, 512)
        self.fc_artist2 = nn.Linear(512, num_artists)

        # Genre classification branch
        self.fc_genre1 = nn.Linear(256 * self.size * self.size, 512)
        self.fc_genre2 = nn.Linear(512, num_genres)

        # Style classification branch
        self.fc_style1 = nn.Linear(256 * self.size * self.size, 512)
        self.fc_style2 = nn.Linear(512, num_styles)

    def forward(self, x):
        # Shared convolutional layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)  # flatten while preserving the batch dimension

        # Artist classification branch
        artists_out = F.relu(self.fc_artist1(x))
        artists_out = self.fc_artist2(artists_out)

        # Genre classification branch
        genre_out = F.relu(self.fc_genre1(x))
        genre_out = self.fc_genre2(genre_out)

        # Style classification branch
        style_out = F.relu(self.fc_style1(x))
        style_out = self.fc_style2(style_out)

        return artists_out, genre_out, style_out

# Set the number of classes for each task
num_artists = 129  # Including "Unknown Artist"
num_genres = 11    # Including "Unknown Genre"
num_styles = 27
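As a quick sanity check that the batch dimension now survives end to end (the criterion mirrors the criterion_artist in your training loop; the random images and labels are placeholders):

import torch
import torch.nn as nn

model = WikiartModel(num_artists, num_genres, num_styles)
images = torch.randn(32, 3, 224, 224)
labels_artist = torch.randint(0, num_artists, (32,))

outputs_artist, outputs_genre, outputs_style = model(images)
print(outputs_artist.shape)  # torch.Size([32, 129]): batch size matches the input again

criterion_artist = nn.CrossEntropyLoss()
loss_artist = criterion_artist(outputs_artist, labels_artist)  # no ValueError now
print(loss_artist.item())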