I am trying to build a multi-label text classification model to classify toxic comments. I followed a Medium article from this link: Multi-label Text Classification with BERT using Pytorch
I also used this dataset from Kaggle: jigsaw-toxic-comment-classification-challenge
I'm running my model on Google Colab with a V100 GPU runtime.
Unfortunately, after several hours of training (4 epochs), I get an F1 score of only 0.04214842148421484. My final loss is 0.00354736.
I know that the loss and the F1 score are two different things, but as I understand it, a low loss should translate into a good F1 score. Where did I go wrong?
Here is the code:
import torch
import numpy as np
import pandas as pd
import shutil, sys
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
val_targets=[]
val_outputs=[]
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['comment_text']
        self.targets = self.data.target_list
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())  # normalize whitespace
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_fpath: path to the saved checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load checkpoint
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # handle valid_loss_min based on its type
    valid_loss_min = checkpoint['valid_loss_min']
    if isinstance(valid_loss_min, torch.Tensor):
        valid_loss_min = valid_loss_min.item()
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min
def train_model(start_epochs, n_epochs, valid_loss_min_input,
                training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input

    for epoch in range(start_epochs, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch}, Training Loss: {loss.item()}')
            loss.backward()
            optimizer.step()
            # running mean of the training loss
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('############# Epoch {}: Training End #############'.format(epoch))

        print('############# Epoch {}: Validation Start #############'.format(epoch))
        ######################
        # validate the model #
        ######################
        model.eval()
        outputs, targets = do_validation(validation_loader)
        val_preds = (np.array(outputs) > 0.5).astype(int)
        val_targets = (np.array(targets) > 0.5).astype(int)
        accuracy = metrics.accuracy_score(val_targets, val_preds)
        f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
        f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score (Micro) = {f1_score_micro}")
        print(f"F1 Score (Macro) = {f1_score_macro}")
        print('############# Epoch {}: Validation End #############'.format(epoch))

        # calculate average losses
        train_loss = train_loss / len(training_loader)
        valid_loss = valid_loss / len(validation_loader)
        # print training/validation statistics
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        # save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        # save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, valid_loss))
            # save checkpoint as best model
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
        print('############# Epoch {} Done #############\n'.format(epoch))
    return model
def do_validation(dataloader):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(dataloader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
if __name__ == '__main__':
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    train_df = pd.read_csv(train_data_location, on_bad_lines='skip')
    test_df = pd.read_csv(test_data_location, on_bad_lines='skip')
    select_labels = train_df.columns.values.tolist()[2:]
    train_df['target_list'] = train_df[select_labels].values.tolist()
    test_df['target_list'] = test_df[select_labels].values.tolist()

    MAX_LEN = 64
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 8
    EPOCHS = 10
    LEARNING_RATE = 1e-05

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
    validation_set = CustomDataset(test_df, tokenizer, MAX_LEN)
    train_params = {'batch_size': TRAIN_BATCH_SIZE,
                    'shuffle': True,
                    'num_workers': 0
                    }
    test_params = {'batch_size': VALID_BATCH_SIZE,
                   'shuffle': False,
                   'num_workers': 0
                   }
    training_loader = DataLoader(training_set, **train_params)
    validation_loader = DataLoader(validation_set, **test_params)

    model = BERTClass()
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    checkpoint_path = '/content/checkpoints/current_checkpoint.pt'
    best_model = '/content/checkpoints/best_model.pt'
    trained_model = train_model(1, EPOCHS, np.Inf, training_loader, validation_loader, model,
                                optimizer, checkpoint_path, best_model)
The F1 score is the harmonic mean of precision and recall; it lets you see precision and recall as a single number. Loss values are not directly correlated with other performance metrics. In particular, on a heavily imbalanced dataset like this one, a model can drive the BCE loss very low by predicting low probabilities for every label, yet produce few true positives after thresholding, which keeps F1 near zero.
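To make the relationship concrete, here is a minimal sketch (with made-up binary predictions, not your data) showing that F1 is just the harmonic mean of the precision and recall that sklearn computes:

import numpy as np
from sklearn import metrics

# toy single-label predictions, purely illustrative
y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1, 1])

precision = metrics.precision_score(y_true, y_pred)        # TP / (TP + FP)
recall = metrics.recall_score(y_true, y_pred)              # TP / (TP + FN)
f1_manual = 2 * precision * recall / (precision + recall)  # harmonic mean

# matches sklearn's own computation
assert np.isclose(f1_manual, metrics.f1_score(y_true, y_pred))
print(precision, recall, f1_manual)  # 0.75 0.75 0.75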
For multi-label text classification, accuracy, precision, and recall are the important metrics. Specifically, examine your overall accuracy score, and then the precision and recall scores per class. Outside of research and commercial reporting, an F1 score on its own is not particularly useful.
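A hedged sketch of how you might get those per-class numbers, assuming outputs and targets are the lists returned by your do_validation and select_labels holds the six label names from the dataframe:

import numpy as np
from sklearn import metrics

# per-class precision/recall/F1 over the validation outputs
val_preds = (np.array(outputs) > 0.5).astype(int)
val_true = np.array(targets).astype(int)
print(metrics.classification_report(val_true, val_preds,
                                    target_names=select_labels,
                                    zero_division=0))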
As for why you're getting a low F1 score in the first place: did you split your dataset? I see you import scikit-learn's train_test_split function, but you never call it in your code. It looks like you're passing the entire raw dataset to your training function. A sketch of what the split could look like follows.
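This is a minimal sketch, assuming you validate on a held-out slice of the labeled training data rather than the Kaggle test file:

from sklearn.model_selection import train_test_split

# hold out 20% of the labeled data for validation; random_state is arbitrary
train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=42)

# reset_index so CustomDataset's positional lookups (self.title[index]) still work
train_split = train_split.reset_index(drop=True)
val_split = val_split.reset_index(drop=True)

training_set = CustomDataset(train_split, tokenizer, MAX_LEN)
validation_set = CustomDataset(val_split, tokenizer, MAX_LEN)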