machine-learning  pytorch  bert-language-model  text-classification

Low F1 score despite low loss


I am trying to build a multi-label text classification model to classify toxic comments. I followed a Medium article from this link: Multi-label Text Classification with BERT using Pytorch

I also used this dataset from Kaggle: jigsaw-toxic-comment-classification-challenge

I'm using Google Colab to run my model on a V100 GPU runtime.

Unfortunately, after several hours of training (4 epochs), I end up with an F1 score of only 0.04214842148421484, while my final loss is 0.00354736.

I know that the loss and the F1 score are two different things, but to my understanding a low loss should translate into a better F1 score. Where did I go wrong?

Here is the code:

import torch
import numpy as np
import pandas as pd
import shutil, sys
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

val_targets=[]
val_outputs=[]

class CustomDataset(Dataset):
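    # Wraps the dataframe: tokenizes each comment with the BERT tokenizer and returns
    # input ids, attention mask, token type ids, and the 6 binary target labels.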

    def __init__(self, dataframe, tokenizer, max_len,):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['comment_text']
        self.targets = self.data.target_list
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split(" "))

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
    

class BERTClass(torch.nn.Module):
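    # BERT encoder, dropout on the pooled [CLS] output, and a linear head with one logit per label (6 toxicity labels).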
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
def loss_fn(outputs, targets):
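    # BCEWithLogitsLoss treats each of the 6 outputs as an independent binary classification,
    # which is the standard setup for multi-label problems.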
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load checkpoint
    checkpoint = torch.load(checkpoint_fpath)

    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])

    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])

    # handle valid_loss_min based on its type
    valid_loss_min = checkpoint['valid_loss_min']
    if isinstance(valid_loss_min, torch.Tensor):
        valid_loss_min = valid_loss_min.item()

    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min


def train_model(start_epochs,  n_epochs, valid_loss_min_input,
                training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):

  # initialize tracker for minimum validation loss
  valid_loss_min = valid_loss_min_input


  for epoch in range(start_epochs, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        #print('yyy epoch', batch_idx)
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        #print(outputs.shape)

        loss = loss_fn(outputs, targets)
        if batch_idx%100==0:
           print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)

    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()


    outputs, targets = do_validation(validation_loader)
    val_preds = (np.array(outputs) > 0.5).astype(int)
    val_targets = (np.array(targets) > 0.5).astype(int)
    accuracy = metrics.accuracy_score(val_targets, val_preds)
    f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
    f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

          
    print('############# Epoch {}: Validation End     #############'.format(epoch))
    # calculate average losses
    #print('before cal avg train loss', train_loss)
    train_loss = train_loss/len(training_loader)
    valid_loss = valid_loss/len(validation_loader)
    # print training/validation statistics
    print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
          epoch,
          train_loss,
          valid_loss
          ))

    # create checkpoint variable and add important data
    checkpoint = {
          'epoch': epoch + 1,
          'valid_loss_min': valid_loss,
          'state_dict': model.state_dict(),
          'optimizer': optimizer.state_dict()
    }

    # save checkpoint
    save_ckp(checkpoint, False, checkpoint_path, best_model_path)

    ## TODO: save the model if validation loss has decreased
    if valid_loss <= valid_loss_min:
      print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
      # save checkpoint as best model
      save_ckp(checkpoint, True, checkpoint_path, best_model_path)
      valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))


  return model


def do_validation(dataloader):
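    # Runs the model over a dataloader without gradients and collects sigmoid
    # probabilities and the ground-truth targets as plain Python lists.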
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(dataloader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets


if __name__ == '__main__':

    # If there's a GPU available...
    if torch.cuda.is_available():

        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")

        print('There are %d GPU(s) available.' % torch.cuda.device_count())

        print('We will use the GPU:', torch.cuda.get_device_name(0))

    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    train_df =  pd.read_csv(train_data_location,on_bad_lines='skip')
    test_df = pd.read_csv(test_data_location,on_bad_lines='skip')
    select_labels = train_df.columns.values.tolist()[2:]
    train_df['target_list'] = train_df[select_labels].values.tolist()
    test_df['target_list'] = test_df[select_labels].values.tolist()
    MAX_LEN = 64
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 8
    EPOCHS = 10
    LEARNING_RATE = 1e-05
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
    validation_set = CustomDataset(test_df, tokenizer, MAX_LEN)
    train_params = {'batch_size': TRAIN_BATCH_SIZE,
                    'shuffle': True,
                    'num_workers': 0
                    }

    test_params = {'batch_size': VALID_BATCH_SIZE,
                    'shuffle': False,
                    'num_workers': 0
                    }

    training_loader = DataLoader(training_set, **train_params)
    validation_loader = DataLoader(validation_set, **test_params)
    model = BERTClass()
    model.to(device)
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
    checkpoint_path = '/content/checkpoints/current_checkpoint.pt'
    best_model = '/content/checkpoints/best_model.pt'
    trained_model = train_model(1, EPOCHS, np.inf, training_loader, validation_loader, model,
                        optimizer,checkpoint_path,best_model)
    

 

Solution

  • The F1 score is the harmonic mean of precision and recall; it lets you see precision and recall in a single number. Loss values are not directly correlated with other performance metrics, so a very low loss does not by itself guarantee a good F1, as the toy example below shows.
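
    As a toy illustration (made-up numbers, not your data): with mostly-zero multi-label targets, a model that confidently predicts all zeros gets a very low BCE loss while its F1 score is 0.

    import torch
    from sklearn import metrics

    # Hypothetical targets: 100 comments x 6 labels, only one positive label in total.
    targets = torch.zeros(100, 6)
    targets[0, 0] = 1.0

    # A model that confidently predicts "not toxic" everywhere (large negative logits).
    logits = torch.full((100, 6), -6.0)

    loss = torch.nn.BCEWithLogitsLoss()(logits, targets)
    preds = (torch.sigmoid(logits) > 0.5).int()

    print(loss.item())  # roughly 0.01 -- a very low loss
    print(metrics.f1_score(targets.int().numpy(), preds.numpy(),
                           average='micro', zero_division=0))  # 0.0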

    For multi-label text classification, accuracy, precision, and recall are the important metrics. Specifically, examine your overall accuracy score, and then the precision and recall scores per class. Outside of research and commercial purposes, an F1 score is not particularly useful.
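
    If you want the per-class numbers, something like sklearn's classification_report works on the outputs and targets returned by your do_validation function (label names here are the Jigsaw column names):

    import numpy as np
    from sklearn import metrics

    label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    preds = (np.array(outputs) > 0.5).astype(int)
    print(metrics.classification_report(np.array(targets).astype(int), preds,
                                        target_names=label_names, zero_division=0))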

    As for why you're getting a low F1 score in the first place: did you split your dataset? I see you import sklearn's train_test_split, but you never call it anywhere in your code. It looks like you're passing the entire raw dataset straight to your training function.
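
    A minimal sketch of the missing split, using the train_test_split you already import (the hold-out fraction is arbitrary here):

    from sklearn.model_selection import train_test_split

    # Hold out 20% of the labelled training data for validation.
    train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=42)

    # Reset the index so CustomDataset's positional lookups (self.title[index]) still work.
    train_split = train_split.reset_index(drop=True)
    val_split = val_split.reset_index(drop=True)

    training_set = CustomDataset(train_split, tokenizer, MAX_LEN)
    validation_set = CustomDataset(val_split, tokenizer, MAX_LEN)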