BERT model for query expansion
I am trying to extract keywords from a document. This is the code I have:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
sentence="This is a sentence"
tokens = tokenizer.tokenize(sentence)
print(tokens,"-tokens")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])
print(input_ids)
with torch.no_grad():
    output = model(input_ids)
    attention_scores = output.attentions
print(attention_scores)  # this prints None
This is my code. I am using a simple sentence and trying to extract keywords from it; to do that, I need the attention weights from the BERT model's output.
I tried different tokenizer methods (tokenize, encode, encode_plus) and different BERT variants (bert-large-uncased).
I want to extract the attention part of the model output, but I am not able to do that: output.attentions is always None, so I never get any attention values.
You can't really extract keywords with the code you provided. By default the model does not compute attention weights at all, so output.attentions is None; you would have to request them explicitly, for example by passing output_attentions=True to the model call.
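For reference, here is a minimal sketch of how to actually get the attention weights out of a Hugging Face BertModel (standard transformers API; passing output_attentions=True is the key part):

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("This is a sentence", return_tensors="pt")
with torch.no_grad():
    # attentions are only computed when explicitly requested
    output = model(**inputs, output_attentions=True)

# tuple with one tensor per layer, each of shape
# (batch_size, num_heads, seq_len, seq_len)
print(len(output.attentions), output.attentions[0].shape)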
I benchmarked some transformer models for keyword extraction last year for a university project, so I can offer a solution from there. The script below returns the extracted keywords with their scores as a dictionary.
pip install nltk
pip install spacy
pip install torch
pip install transformers
pip install scikit-learn
python -m spacy download en_core_web_sm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import spacy
nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
def pre_process(text):
    # lowercase
    text = text.lower()
    # remove HTML tags
    text = re.sub(r"</?.*?>", " <> ", text)
    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)
    # convert string to list of words
    text = text.split()
    # remove stopwords
    text = [word for word in text if word not in stop_words]
    # remove words shorter than three letters
    text = [word for word in text if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    return ' '.join(text)
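To see what the pre-processing does, here is a quick example (the exact lemmas depend on your WordNet data, but this is the typical result):

print(pre_process("This is an Example Sentence, with <b>tags</b> and 123 numbers!"))
# typically prints: example sentence tag number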
def get_candidates(text):
    # n-gram size of the candidate phrases
    n_gram_range = (1, 1)
    # extract candidate words/phrases
    count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([text])
    all_candidates = count.get_feature_names_out()
    doc = nlp(text)
    noun_phrases = set(chunk.text.strip() for chunk in doc.noun_chunks)
    # only pass noun/adjective keywords on for extraction
    noun_adj = set()
    for token in doc:
        if token.pos_ in ("NOUN", "ADJ"):
            noun_adj.add(token.text)
    all_words = noun_adj.union(noun_phrases)
    candidates = list(filter(lambda candidate: candidate in all_words, all_candidates))
    return candidates
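A quick sanity check of the candidate generation on the pre-processed text from the previous example (spaCy's POS tags can vary a little between model versions, so treat this output as indicative):

print(get_candidates("example sentence tag number"))
# e.g. ['example', 'number', 'sentence', 'tag']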
# the keyword extraction method
def keyword_extract(string, model_name):
    # obtain candidate keywords and document representations
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    text = pre_process(string)
    candidates = get_candidates(text)
    candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")
    candidate_embeddings = model(**candidate_tokens)["pooler_output"]
    # embed the whole document the same way
    text_tokens = tokenizer([text], padding=True, return_tensors="pt", truncation=True, max_length=512)
    text_embedding = model(**text_tokens)["pooler_output"]
    candidate_embeddings = candidate_embeddings.detach().numpy()
    text_embedding = text_embedding.detach().numpy()
    # score each candidate by cosine similarity to the document embedding
    distances = cosine_similarity(text_embedding, candidate_embeddings)
    keywords = {i: j for i, j in zip(candidates, distances[0])}
    # sort by similarity score and return the dictionary
    return dict(sorted(keywords.items(), key=lambda x: x[1], reverse=True))
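Since this is inference only, the two forward passes could also go inside torch.no_grad(); that skips building the autograd graph and makes the .detach() calls unnecessary. A drop-in replacement for the embedding lines inside keyword_extract (a minor optimization, not part of the original script):

import torch

with torch.no_grad():
    # no gradients are tracked, so .numpy() works directly
    candidate_embeddings = model(**candidate_tokens)["pooler_output"].numpy()
    text_embedding = model(**text_tokens)["pooler_output"].numpy()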
# put the model name here
model_name = "bert-base-uncased"
my_sentence = "Hello there! This is an example sentence and this is the code that I'm using to extract keywords from a sentence"
keywords = keyword_extract(my_sentence, model_name)
print(keywords)
The output contains the keywords sorted by their scores:
{'keywords': 0.97554314, 'example': 0.94142365, 'extract': 0.885766, 'code': 0.83722556, 'sentence': 0.76782566}