python, nlp, huggingface-transformers, bert-language-model, word-embedding

How to get the attentions part from the output of a BERT model?


I am using a BERT model for query expansion, and I am trying to extract keywords from the document I have.

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

sentence = "This is a sentence"
tokens = tokenizer.tokenize(sentence)
print(tokens, "-tokens")

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])
print(input_ids)

with torch.no_grad():
    output = model(input_ids)
    attention_scores = output.attentions
    print(attention_scores)   # this prints None

This is my code. I am using a simple sentence and trying to extract keywords from it; to do that, I need the attentions part of the BERT model's output.

I tried different tokenizer methods (tokenize, encode, encode_plus) and different BERT variants (bert-large-uncased).

I want to extract the attentions from the model output, but I am not able to: output.attentions is None, so I cannot get any value in the attentions part of the output.


Solution

  • You can't really extract keywords with the code you provided. output.attentions is None because the model was never asked to return attention weights (output_attentions is not set), so there is nothing to read there.
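
    If you actually want the attention weights, a minimal sketch (assuming a reasonably recent transformers version, where the model returns a dict-like output by default) is to request them explicitly with output_attentions=True:

    import torch
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer("This is a sentence", return_tensors="pt")
    with torch.no_grad():
        # ask the model to return the per-layer attention weights
        output = model(**inputs, output_attentions=True)

    # output.attentions is a tuple with one tensor per layer,
    # each of shape (batch_size, num_heads, seq_len, seq_len)
    print(len(output.attentions), output.attentions[0].shape)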

    I benchmarked some transformer models for keyword extraction last year for a university project, so I will share a solution from there. The script below returns the extracted keywords with their scores as a dictionary.

    Solution

    pip install nltk
    pip install spacy
    pip install scikit-learn
    pip install torch
    pip install transformers
    python -m spacy download en_core_web_sm
    
    from transformers import AutoModel, AutoTokenizer
    from sklearn.metrics.pairwise import cosine_similarity
    import nltk
    from nltk.stem.wordnet import WordNetLemmatizer
    import re
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import CountVectorizer
    import spacy
    
    nlp = spacy.load('en_core_web_sm')
    nltk.download('stopwords')
    nltk.download('wordnet')
    stop_words = set(stopwords.words('english'))
    
    def pre_process(text):
        # lowercase
        text=text.lower()
        #remove tags
        text=re.sub("</?.*?>"," <> ",text)
        # remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)
        ##Convert to list from string
        text = text.split()
        # remove stopwords
        text = [word for word in text if word not in stop_words]
        # remove words less than three letters
        text = [word for word in text if len(word) >= 3]
        #lemmatize
        lmtzr = WordNetLemmatizer()
        text = [lmtzr.lemmatize(word) for word in text]
        return ' '.join(text)
    
    def get_candidates(text):
        # group of words
        n_gram_range = (1, 1)
    
        # Extract candidate words/phrases
        count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([text])
        all_candidates = count.get_feature_names_out()
        
        doc = nlp(text)
        noun_phrases = set(chunk.text.strip() for chunk in doc.noun_chunks)
    
        # Only pass the Noun/Adjective keywords for extraction
        noun_adj = set()
        for token in doc:
            if token.pos_ == "NOUN":
                noun_adj.add(token.text)
            if token.pos_ == "ADJ":
                noun_adj.add(token.text)
    
        all_words = noun_adj.union(noun_phrases)
        candidates = list(filter(lambda candidate: candidate in all_words, all_candidates))
        return candidates
    
    # The keyword extraction method
    def keyword_extract(string, model_name):
    
        # Obtaining candidate keywords and document representations
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    
        text=pre_process(string)
        candidates=get_candidates(text)
        candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")
        candidate_embeddings = model(**candidate_tokens)["pooler_output"]
    
        # Determination of keywords:
        text_tokens = tokenizer([text], padding=True, return_tensors="pt", truncation=True, max_length=512)
        text_embedding = model(**text_tokens)["pooler_output"]
    
        candidate_embeddings = candidate_embeddings.detach().numpy()
        text_embedding = text_embedding.detach().numpy()
    
        distances = cosine_similarity(text_embedding, candidate_embeddings)
        keywords = {i:j for i,j in zip(candidates, distances[0])}
    
        # sort by similarity score (highest first) and return the dictionary
        return dict(sorted(keywords.items(), key=lambda x:x[1], reverse=True))
    
    # put the model name here
    model_name = "bert-base-uncased"
    
    my_sentence = "Hello there! This is an example sentence and this is the code that I'm using to extract keywords from a sentence"
    
    keywords = keyword_extract(my_sentence, model_name)
    print(keywords)
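
    # Side note on a design choice: "pooler_output" works here, but for
    # cosine-similarity comparisons a common alternative is to mean-pool the
    # last hidden states over the non-padding tokens. A small sketch, reusing
    # the same tokenizer(..., return_tensors="pt") output format as above:
    def mean_pooled_embedding(model, tokens):
        output = model(**tokens)
        mask = tokens["attention_mask"].unsqueeze(-1).float()    # 1 for real tokens, 0 for padding
        summed = (output.last_hidden_state * mask).sum(dim=1)    # sum over the sequence dimension
        return summed / mask.sum(dim=1)                          # average over the non-padding tokens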
    

    Output

    The output is the keywords sorted according to their score:

    {'keywords': 0.97554314, 'example': 0.94142365, 'extract': 0.885766, 'code': 0.83722556, 'sentence': 0.76782566}
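
    Since the dictionary is already sorted by score, picking the top-N keywords for query expansion is just a slice (small usage sketch; top_k = 3 is an arbitrary choice):

    # keep only the best-scoring candidates, e.g. to expand a query
    top_k = 3
    top_keywords = [word for word, score in list(keywords.items())[:top_k]]
    print(top_keywords)   # ['keywords', 'example', 'extract']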