python machine-learning huggingface-transformers text-classification

How to Change Evaluation Metric from ROC AUC to Accuracy in Hugging Face Transformers Fine-Tuning?


I'm working on a text classification task using the Hugging Face Transformers library in Python. My code is set up to use ROC AUC as the evaluation metric, but I need to change it to accuracy. I've made attempts to modify the code, but I'm running into issues.

Here's a simplified version of the code I'm working with:

# install packages
#!pip install torch transformers memory_profiler datasets accelerate
import time
import datetime
# Wall-clock timestamp taken when the script starts; `tic` is not read again
# anywhere in the visible code.
tic = time.time()

# import modules
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Function to define datasets
def create_datasets(X, y):
  """Split the data, tokenize both halves and wrap them as torch datasets.

  Args:
    X: The input texts. Must expose ``tolist()`` (e.g. a pandas Series or a
      NumPy array).
    y: The labels aligned with ``X``. Any array-like that
      ``train_test_split`` accepts (Series, ndarray, list).

  Returns:
    A ``(train_dataset, valid_dataset)`` tuple of ``torch.utils.data.Dataset``
    objects. Each item is a dict holding the tokenizer outputs as tensors
    plus a ``"labels"`` tensor of shape ``(1,)``.

  Note:
    The tokenizer checkpoint is taken from the module-level ``model_name``.
    No ``random_state`` is passed to ``train_test_split``.
  """
  # Hold out 20% of the samples for validation
  X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=0.2)

  # Call the Tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

  # Tokenize the text (padded per split, truncated to at most 512 tokens)
  train_tokens = tokenizer(X_train, truncation=True, padding=True, max_length=512)
  valid_tokens = tokenizer(X_test, truncation=True, padding=True, max_length=512)

  class MakeTorchData(torch.utils.data.Dataset):
      """Pairs tokenizer output with labels, indexable by sample position."""

      def __init__(self, tokens, labels):
          self.tokens = tokens
          self.labels = labels

      def __getitem__(self, idx):
          # One tensor per tokenizer field (input_ids, attention_mask, ...)
          item = {k: torch.tensor(v[idx]) for k, v in self.tokens.items()}
          item["labels"] = torch.tensor([self.labels[idx]])
          return item

      def __len__(self):
          return len(self.labels)

  # Convert the labels to flat, positionally indexed NumPy arrays.
  # `np.asarray(...).ravel()` replaces `Series.ravel()`, which is deprecated
  # in recent pandas releases, and also works when `y` is a plain list.
  train_dataset = MakeTorchData(train_tokens, np.asarray(y_train).ravel())
  valid_dataset = MakeTorchData(valid_tokens, np.asarray(y_test).ravel())

  return train_dataset, valid_dataset

# Name of the evaluation metric. It is used both to load the metric object
# below and as `metric_for_best_model` in `create_trainer`.
# NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# installed version.
metric_name = "roc_auc"
metric = load_metric(metric_name)

# Define metrics
def compute_metrics(eval_pred):
  """Compute ROC AUC for a binary classifier from the evaluation output.

  Args:
    eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
      raw logits with one row per sample and one column per class;
      ``labels`` holds the reference class ids.

  Returns:
    Whatever the module-level ``metric.compute(...)`` returns for the
    positive-class scores and the flattened references.

  Raises:
    ValueError: If the logits do not have exactly two class columns.
  """
  logits, labels = eval_pred
  logits = np.asarray(logits, dtype=np.float64)

  if logits.ndim != 2 or logits.shape[1] != 2:
    raise ValueError(
        "compute_metrics expects binary logits of shape (n_samples, 2), "
        f"got shape {logits.shape}"
    )

  # ROC AUC needs continuous scores. Feeding it argmax'd class ids collapses
  # the ROC curve to a single operating point, so turn the logits into
  # probabilities with a numerically stable softmax instead.
  shifted = logits - logits.max(axis=1, keepdims=True)
  exp_logits = np.exp(shifted)
  probabilities = exp_logits / exp_logits.sum(axis=1, keepdims=True)

  # Column 1 is the probability of the positive class (label 1)
  positive_scores = probabilities[:, 1]

  # The datasets in this file yield labels of shape (1,) per sample, so the
  # collected references arrive as (n_samples, 1); flatten them to 1-D.
  references = np.ravel(labels)

  # `average` only matters for multiclass/multilabel targets; it is kept so
  # the call stays the same as before for binary targets.
  return metric.compute(prediction_scores=positive_scores, references=references, average="macro")

# Create trainer
# Specify the arguments for the trainer
def create_trainer(model_name, train_dataset, valid_dataset, num_epochs=5):
  """Build a ``Trainer`` configured for step-based evaluation and saving.

  Args:
    model_name: Despite its name, the instantiated model to train (this is
      what the caller in this file passes). For backward compatibility a
      string is still accepted, in which case the module-level ``model`` is
      trained, exactly as before.
    train_dataset: Dataset used for training.
    valid_dataset: Dataset used for evaluation.
    num_epochs: Total number of training epochs.

  Returns:
    A ``Trainer`` wired to the module-level ``compute_metrics`` callback and
    selecting the best checkpoint by the module-level ``metric_name``.
  """
  # Previously this parameter was ignored and the global `model` was always
  # trained, which only worked because the caller happened to assign that
  # global first. Use the object that was actually passed in.
  model_to_train = model if isinstance(model_name, str) else model_name

  training_args = TrainingArguments(
      output_dir='./results',              # output directory
      num_train_epochs=num_epochs,         # total number of training epochs
      per_device_train_batch_size=8,       # batch size per device during training
      per_device_eval_batch_size=20,       # batch size for evaluation
      warmup_steps=500,                    # number of warmup steps for learning rate scheduler
      weight_decay=0.01,                   # strength of weight decay
      logging_dir='./logs',                # directory for storing logs
      load_best_model_at_end=True,         # load the best model when finished training
      metric_for_best_model=metric_name,   # metric used to pick the best checkpoint
      logging_steps=200,                   # log every 200 steps
      save_steps=200,                      # save a checkpoint every 200 steps
      evaluation_strategy="steps",         # evaluate each `logging_steps`
  )

  trainer = Trainer(
      model=model_to_train,                # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=valid_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
  )
  return trainer

# Checkpoint used for both the tokenizer and the classification model
model_name = "sentence-transformers/all-distilroberta-v1"

# Read the essays CSV (adjust the path to your own folder layout) and turn
# cells equal to 'y' / 'n' into 1 / 0 in a single chained expression.
df = pd.read_csv(r"C:\path\essays.csv", encoding="latin-1").replace({'y': 1, 'n': 0})

# Inputs are the TEXT column, targets are the cEXT column
X, y = df['TEXT'], df['cEXT']

# Build the train / validation torch datasets
train_dataset, valid_dataset = create_datasets(X, y)

# Dropout probabilities to try for both the hidden and the attention layers
dropout_probs = [0.1, 0.2, 0.3, 0.4, 0.5]

# Every (hidden, attention) combination, in the same order two nested loops
# over `dropout_probs` would visit them.
dropout_grid = [
    (hidden, attention)
    for hidden in dropout_probs
    for attention in dropout_probs
]

for hidden_dropout_prob, attention_probs_dropout_prob in dropout_grid:
    # Fresh model for this configuration, placed on the CPU
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
    )
    model = model.to("cpu")

    # Train for three epochs and keep what `train()` returns
    trainer = create_trainer(model, train_dataset, valid_dataset, num_epochs=3)
    training_results = trainer.train()

    # Score the trained model on the evaluation dataset
    results = trainer.evaluate()

    # Report the outcome for this combination
    print(f"Model with hidden_dropout_prob={hidden_dropout_prob} and attention_probs_dropout_prob={attention_probs_dropout_prob}:")
    print(f"Training Results: {training_results}")
    print(f"Results: {results}")

In this code, I've defined the compute_metrics function and the evaluation metric as ROC AUC, which works well. However, I would like to replace ROC AUC with accuracy in the evaluation of my models.

I would greatly appreciate any guidance on how to adjust this code to calculate accuracy as the evaluation metric instead of ROC AUC. What modifications should I make to the compute_metrics function and other relevant parts of the code?

My dataset looks like this:

            #AUTHID                                               TEXT  cEXT  \
0  1997_504851.txt  Well, right now I just woke up from a mid-day ...     0   
1  1997_605191.txt  Well, here we go with the stream of consciousn...     0   
2  1997_687252.txt  An open keyboard and buttons to push. The thin...     0   
3  1997_568848.txt  I can't believe it!  It's really happening!  M...     1   
4  1997_688160.txt  Well, here I go with the good old stream of co...     1   

Solution

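  • Modify the compute_metrics function to calculate prediction accuracy, and also set metric_name = "accuracy" (dropping the load_metric call) so that metric_for_best_model matches the "accuracy" key that compute_metrics now returns: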

    from sklearn.metrics import accuracy_score
    
    def compute_metrics(eval_pred):
        """Return the accuracy of the argmax class predictions.

        ``eval_pred`` unpacks into ``(predictions, labels)``; the result is a
        dict with the single key ``"accuracy"``.
        """
        logits, references = eval_pred
        # Highest-scoring class per row
        predicted_classes = np.argmax(logits, axis=1)
        return {"accuracy": accuracy_score(y_true=references, y_pred=predicted_classes)}