I have the following code that I am using to identify the most influential words in the texts that the model predicts correctly on the test dataset.
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from captum.attr import IntegratedGradients
# Loading data
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def preprocess_data(df, tokenizer, max_len=128):
    inputs = tokenizer(list(df['text']), padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    labels = torch.tensor(df['label'].values)
    return inputs, labels
train_inputs, train_labels = preprocess_data(train_df, tokenizer)
test_inputs, test_labels = preprocess_data(test_df, tokenizer)
# DataLoader
train_dataset = torch.utils.data.TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)
# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Training Loop
model.train()
for epoch in range(3):  # Train for 3 epochs
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} loss: {loss.item()}")
# Evaluation
model.eval()
correct_predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions.extend(
            (preds == labels).cpu().numpy().tolist()
        )
accuracy = accuracy_score(test_labels.numpy(), correct_predictions)
print(f"Test Accuracy: {accuracy:.2f}")
# Integrated Gradients
ig = IntegratedGradients(model)
def get_influential_words(input_text, model, tokenizer, ig, device):
    model.eval()
    # Tokenizing the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    input_ids = inputs['input_ids'].to(device, dtype=torch.long)  # Explicitly convert to LongTensor
    attention_mask = inputs['attention_mask'].to(device, dtype=torch.long)  # Explicitly convert to LongTensor
    print("Input IDs shape:", input_ids.shape, "dtype:", input_ids.dtype)
    print("Attention mask shape:", attention_mask.shape, "dtype:", attention_mask.dtype)
    # forward function for IG
    def forward_func(input_ids):
        outputs = model(input_ids, attention_mask=attention_mask)
        return outputs.logits
    # Applying Integrated Gradients
    attributions, delta = ig.attribute(input_ids, target=1, return_convergence_delta=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    token_importances = attributions.sum(dim=2).squeeze(0).detach().cpu().numpy()
    return list(zip(tokens, token_importances))
# Analysing influential words for correctly predicted texts
for idx, correct in enumerate(correct_predictions):
    if correct:
        influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, ig, device)
        print(f"Influential words for text: {test_df['text'].iloc[idx]}")
        print(influential_words)
But I am getting the following error when running the above.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Epoch 1 loss: 0.4719192385673523
Epoch 2 loss: 0.39585667848587036
Epoch 3 loss: 0.14659778773784637
Test Accuracy: 0.70
Input IDs shape: torch.Size([1, 8]) dtype: torch.int64
Attention mask shape: torch.Size([1, 8]) dtype: torch.int64
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-9-f047b509c98d> in <cell line: 90>()
90 for idx, correct in enumerate(correct_predictions):
91 if correct:
---> 92 influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, ig, device)
93 print(f"Influential words for text: {test_df['text'].iloc[idx]}")
94 print(influential_words)
18 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2549 # remove once script supports set_grad_enabled
2550 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2551 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2552
2553
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
You need to change the attribution class slightly. Also, you never passed forward_func into the constructor of the attribution class, so the attribute method could not run the forward pass properly.
I think LayerIntegratedGradients is better for interpreting BERT, in line with this tutorial: https://captum.ai/tutorials/Bert_SQUAD_Interpret. It attributes with respect to the embedding layer, so the integer token IDs are never interpolated into floats (that interpolation is what triggers your RuntimeError; see the short demo below).
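To see where the error comes from: plain IntegratedGradients builds interpolated inputs between a baseline and your input, so the token IDs reach BERT's embedding layer as float tensors, which nn.Embedding rejects. A minimal reproduction (the sizes and IDs here are just illustrative):
import torch
import torch.nn as nn
emb = nn.Embedding(num_embeddings=30522, embedding_dim=8)
ids = torch.tensor([[101, 2023, 2003, 102]])
print(emb(ids).shape)    # Long indices work fine
emb(0.5 * ids.float())   # float "indices", as IG produces -> RuntimeError: expected Long/Int indices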
Below is a snippet that works:
from captum.attr import LayerIntegratedGradients
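# Note: `custom_forward` below calls a `predict` helper that isn't shown in the original snippet.
# A minimal sketch of it, assuming it should simply return the model's logits for a batch of input IDs:
def predict(inputs):
    return model(inputs).logits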
def custom_forward(inputs):
    preds = predict(inputs)
    return torch.softmax(preds, dim=1)[0][1].unsqueeze(-1)

lig = LayerIntegratedGradients(custom_forward, model.bert.embeddings)
def get_influential_words(input_text, model, tokenizer, lig, device):
    model.eval()
    # Tokenizing the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)  # not strictly needed for a single, unpadded sequence
    # Attributions are computed on the embedding layer, so input_ids can stay integer tensors
    attributions, delta = lig.attribute(input_ids, return_convergence_delta=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    token_importances = attributions.sum(dim=2).squeeze(0).detach().cpu().numpy()
    return list(zip(tokens, token_importances))
results = []
for idx, correct in enumerate(correct_predictions):
    if correct:
        influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, lig, device)
        results.append((test_df['text'].iloc[idx], influential_words))
        print(f"Influential words for text: {test_df['text'].iloc[idx]}")
        print(influential_words)
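If you only want the strongest words rather than every token, you can sort each attribution list by magnitude. A small sketch, assuming the `results` list built above and skipping BERT's special tokens:
special_tokens = {'[CLS]', '[SEP]', '[PAD]'}
for text, word_scores in results:
    # keep the 5 tokens with the largest absolute attribution
    top = sorted(
        (pair for pair in word_scores if pair[0] not in special_tokens),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )[:5]
    print(text)
    print(top)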