python, nlp, huggingface-transformers, bert-language-model, word-embedding

How to get the attentions part from the output of a BERT model?


I am using a BERT model for query expansion, and I am trying to extract keywords from the document I have.

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

sentence = "This is a sentence"
tokens = tokenizer.tokenize(sentence)
print(tokens, "-tokens")

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])
print(input_ids)

with torch.no_grad():
    output = model(input_ids)
    attention_scores = output.attentions
    print(attention_scores)   # this prints None

This is my code. I am using a simple sentence and trying to extract keywords from it; to do that, I need the attentions part of the BERT model's output.

I tried different tokenizer methods (tokenize, encode, encode_plus) and different BERT variants (bert-large-uncased).

I want to extract the attentions from the model output, but I am not able to: output.attentions is None, so I cannot get any value in the attentions part of the output.


Solution

  • You can't really extract keywords with the code you provided. output.attentions is None because the model was never asked to return attention weights (output_attentions is not set), so there is nothing to read there.
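
    If you actually want the attention weights, a minimal sketch (assuming a reasonably recent transformers version, where the model returns a dict-like output by default) is to request them explicitly with output_attentions=True:

    import torch
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer("This is a sentence", return_tensors="pt")
    with torch.no_grad():
        # ask the model to return the per-layer attention weights
        output = model(**inputs, output_attentions=True)

    # output.attentions is a tuple with one tensor per layer,
    # each of shape (batch_size, num_heads, seq_len, seq_len)
    print(len(output.attentions), output.attentions[0].shape)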

    I benchmarked some transformer models for keyword extraction last year for a university project, so I will share a solution from there. The script below returns the extracted keywords with their scores as a dictionary.

    Solution

    pip install nltk
    pip install spacy
    pip install scikit-learn
    pip install torch
    pip install transformers
    python -m spacy download en_core_web_sm
    
    from transformers import AutoModel, AutoTokenizer
    from sklearn.metrics.pairwise import cosine_similarity
    import nltk
    from nltk.stem.wordnet import WordNetLemmatizer
    import re
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import CountVectorizer
    import spacy
    
    nlp = spacy.load('en_core_web_sm')
    nltk.download('stopwords')
    nltk.download('wordnet')
    stop_words = set(stopwords.words('english'))
    
    def pre_process(text):
        # lowercase
        text=text.lower()
        #remove tags
        text=re.sub("</?.*?>"," <> ",text)
        # remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)
        ##Convert to list from string
        text = text.split()
        # remove stopwords
        text = [word for word in text if word not in stop_words]
        # remove words less than three letters
        text = [word for word in text if len(word) >= 3]
        #lemmatize
        lmtzr = WordNetLemmatizer()
        text = [lmtzr.lemmatize(word) for word in text]
        return ' '.join(text)
    
    def get_candidates(text):
        # group of words
        n_gram_range = (1, 1)
    
        # Extract candidate words/phrases
        count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([text])
        all_candidates = count.get_feature_names_out()
        
        doc = nlp(text)
        noun_phrases = set(chunk.text.strip() for chunk in doc.noun_chunks)
    
        # Only pass the Noun/Adjective keywords for extraction
        noun_adj = set()
        for token in doc:
            if token.pos_ == "NOUN":
                noun_adj.add(token.text)
            if token.pos_ == "ADJ":
                noun_adj.add(token.text)
    
        all_words = noun_adj.union(noun_phrases)
        candidates = list(filter(lambda candidate: candidate in all_words, all_candidates))
        return candidates
    
    # The keyword extraction method
    def keyword_extract(string, model_name):
    
        # Obtaining candidate keywords and document representations
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    
        text=pre_process(string)
        candidates=get_candidates(text)
        candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")
        candidate_embeddings = model(**candidate_tokens)["pooler_output"]
    
        # Determination of keywords:
        text_tokens = tokenizer([text], padding=True, return_tensors="pt", truncation=True, max_length=512)
        text_embedding = model(**text_tokens)["pooler_output"]
    
        candidate_embeddings = candidate_embeddings.detach().numpy()
        text_embedding = text_embedding.detach().numpy()
    
        distances = cosine_similarity(text_embedding, candidate_embeddings)
        keywords = {i:j for i,j in zip(candidates, distances[0])}
    
        # sort by similarity score (highest first) and return the dictionary
        return dict(sorted(keywords.items(), key=lambda x:x[1], reverse=True))
    
    # put the model name here
    model_name = "bert-base-uncased"
    
    my_sentence = "Hello there! This is an example sentence and this is the code that I'm using to extract keywords from a sentence"
    
    keywords = keyword_extract(my_sentence, model_name)
    print(keywords)
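
    # Side note on a design choice: "pooler_output" works here, but for
    # cosine-similarity comparisons a common alternative is to mean-pool the
    # last hidden states over the non-padding tokens. A small sketch, reusing
    # the same tokenizer(..., return_tensors="pt") output format as above:
    def mean_pooled_embedding(model, tokens):
        output = model(**tokens)
        mask = tokens["attention_mask"].unsqueeze(-1).float()    # 1 for real tokens, 0 for padding
        summed = (output.last_hidden_state * mask).sum(dim=1)    # sum over the sequence dimension
        return summed / mask.sum(dim=1)                          # average over the non-padding tokens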
    

    Output

    The output is the keywords sorted according to their score:

    {'keywords': 0.97554314, 'example': 0.94142365, 'extract': 0.885766, 'code': 0.83722556, 'sentence': 0.76782566}
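
    Since the dictionary is already sorted by score, picking the top-N keywords for query expansion is just a slice (small usage sketch; top_k = 3 is an arbitrary choice):

    # keep only the best-scoring candidates, e.g. to expand a query
    top_k = 3
    top_keywords = [word for word, score in list(keywords.items())[:top_k]]
    print(top_keywords)   # ['keywords', 'example', 'extract']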