
Huggingface Bert: Output Printing

I'm new to coding and could use some guidance on why the output below is printed so oddly, with a space between every letter of each predicted word. Although this is related to NLP, I believe it could most likely be explained by somebody with more coding experience than I have. I hope this is the right place to ask this question. Thank you for the help!

from transformers import AutoTokenizer, AutoModelWithLMHead
import torch

# Fill-in-the-blank demo: print the 25 highest-scoring candidate tokens for
# every [MASK] in `sentence`, then the top candidate for each mask.
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")

model = AutoModelWithLMHead.from_pretrained("bert-large-cased-whole-word-masking")

sentence = """While United States [MASK] heed human rights,"""

token_ids = tokenizer.encode(sentence, return_tensors='pt')

# Positions in the encoded sequence whose id equals the tokenizer's mask id.
masked_position = (token_ids.squeeze() == tokenizer.mask_token_id).nonzero()
masked_pos = [mask.item() for mask in masked_position]
print (masked_pos)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(token_ids)

# Despite the name, each row is used below as a score per vocabulary entry:
# its top-k indices are decoded as token ids.
last_hidden_state = output[0].squeeze()

print ("\n\n")
print ("sentence :", sentence)
print ("\n")
list_of_list = []
for mask_index in masked_pos:
    mask_hidden_state = last_hidden_state[mask_index]
    # torch.topk returns (values, indices); only the indices are needed.
    idx = torch.topk(mask_hidden_state, k=25, dim=0)[1]
    # NOTE(review): decode() is handed a bare int here rather than a list of
    # ids; that appears to be what produces the letter-spaced words in the
    # printed output - confirm against the tokenizer documentation.
    words = [tokenizer.decode(i.item()).strip() for i in idx]
    print (words)
    # Keep the candidates for this mask; without this the loop below has
    # nothing to iterate over and the best guess is always empty.
    list_of_list.append(words)

# The first candidate of every mask, each preceded by a single space.
best_guess = ""
for j in list_of_list:
    best_guess = best_guess+" "+j[0]

print ("\nBest guess for fill in the blank :::",best_guess)


['While', 'United', 'States', '[MASK]', 'he', '##ed', 'human', 'rights', ',']


sentence : While United States [MASK] heed human rights,

['m u s t', 'c i t i z e n s', 's h o u l d', 'c a n n o t', 'l a w s', 'd o e s', 'g e n e r a l l y', 'd i d', 'a l w a y s', 'l a w', ',', 'g o v e r n m e n t', 'd o', 'p o l i t i c i a n s', 'm a y', 'd e f e n d e r s', 'c o u n t r i e s', 'c a n', 'o f f i c i a l s', 'g o v e r n m e n t s', 'w i l l', 'G o v e r n m e n t', 'v a l u e s', 'C o n s t i t u t i o n', 'p e o p l e']

Best guess for fill in the blank :::  m u s t


  • The first thing you have to understand is the tokenised output that the BERT tokenizer gives back.

    If you look at the decoded output, each token already comes back with spaces between its characters (I have added some print statements that make this clear).

    If you just want clean output, change the lines that I have marked with comments:

    !pip3 install transformers
    from transformers import AutoTokenizer, AutoModelWithLMHead
    import torch
    # Fill-in-the-blank demo: print the 25 highest-scoring candidate tokens for
    # every [MASK] in `sentence`, then the top candidate for each mask.
    tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
    model = AutoModelWithLMHead.from_pretrained("bert-large-cased-whole-word-masking")
    sentence = """While United States [MASK] heed human rights,"""
    token_ids = tokenizer.encode(sentence, return_tensors='pt')

    # Positions in the encoded sequence whose id equals the tokenizer's mask id.
    masked_position = (token_ids.squeeze() == tokenizer.mask_token_id).nonzero()
    masked_pos = [mask.item() for mask in masked_position]
    print (masked_pos)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(token_ids)
    # Each row is used below as a score per vocabulary entry: its top-k
    # indices are decoded as token ids.
    vocab_scores = output[0].squeeze()

    print ("\n\n")
    print ("sentence :", sentence)
    print ("\n")
    list_of_list = []
    for mask_index in masked_pos:
        # torch.topk returns (values, indices); only the indices are needed.
        idx = torch.topk(vocab_scores[mask_index], k=25, dim=0)[1]
        # Debug print: each candidate id next to decode() called on the bare
        # id, which is the call that showed the letter-spaced text.
        for i in idx:
            print(i, tokenizer.decode(i.item()).strip())
        words = [tokenizer.decode([i.item()]).strip() for i in idx]  ## PASS THE ID AS A ONE-ELEMENT LIST SO THE TOKEN IS NOT SPLIT INTO SPACED CHARACTERS
        print ("WORDS", words)
        list_of_list.append(words)  ## STORE THE CANDIDATES, OTHERWISE THE BEST GUESS BELOW IS ALWAYS EMPTY
    best_guess = ""
    for j in list_of_list:
        best_guess = (best_guess+" "+j[0]).strip()  ## strip() REMOVES THE LEADING SPACE LEFT BY THE FIRST CONCATENATION
    print ("\nBest guess for fill in the blank :::", best_guess)