How can we get the prediction logits from the LXMERT model using the Hugging Face library? It's fairly easy to get them with VisualBERT, but I'm not able to get them with LXMERT. With the VisualBERT model, the output keys I'm getting are:
['prediction_logits', 'seq_relationship_logits', 'attentions']
and with the LXMERT model, the keys are:
['language_output', 'vision_output', 'pooled_output', 'language_attentions', 'vision_attentions', 'cross_encoder_attentions']
Even though prediction logits are mentioned in the documentation, I am not able to get them. If someone can help, that would be great.
EDIT: Link to Colab notebook for LXMERT.
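For reference, here is roughly how the VisualBERT keys above can be obtained. This is only a minimal sketch; the uclanlp/visualbert-vqa-coco-pre checkpoint and the zero-filled dummy visual features are assumptions for illustration:
from transformers import BertTokenizer, VisualBertForPreTraining
import torch
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
inputs = tokenizer("dog and cat are in the room", return_tensors="pt")
visual_embeds = torch.zeros(1, 36, 2048)  # dummy region features, illustrative only
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
    }
)
outputs = model(**inputs, output_attentions=True)
outputs.keys()  # includes 'prediction_logits' and 'seq_relationship_logits'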
Use LxmertForPreTraining instead of LxmertModel:
###Colab commands
#!pip install transformers
#!git clone https://github.com/huggingface/transformers
#%cd transformers
#%cd examples/research_projects/lxmert
#!pip install wget
from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
import wget
import pickle
import os
import cv2
from copy import deepcopy
torch.cuda.is_available()
URL = "https://github.com/jacobgil/vit-explain/raw/main/examples/both.png"
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)
# run frcnn
images, sizes, scales_yx = image_preprocess(URL)
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)
# Very important that the boxes are normalized
normalized_boxes = output_dict.get("normalized_boxes")
features = output_dict.get("roi_features")
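A quick shape check can help before feeding these into LXMERT; the exact region count depends on frcnn_cfg.max_detections (36 for this checkpoint, if I recall correctly):
features.shape          # e.g. torch.Size([1, 36, 2048]) -> (batch, regions, feature dim)
normalized_boxes.shape  # e.g. torch.Size([1, 36, 4])    -> (batch, regions, box coordinates)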
from transformers import LxmertTokenizer, LxmertForPreTraining
import torch
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
text_sentence = "dog and cat are in the room and " + tokenizer.mask_token + " is laying on the ground"
inputs = tokenizer(text_sentence, return_token_type_ids=True, return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
visual_feats = features
visual_attention_mask = torch.ones(features.shape[:-1], dtype=torch.long)  # attend to every detected region
visual_pos = normalized_boxes
inputs.update(
    {
        "visual_feats": visual_feats,
        "visual_pos": visual_pos,
        "visual_attention_mask": visual_attention_mask,
    }
)
model_outputs = model(**inputs, output_attentions=True)
model_outputs.keys()
Output:
odict_keys(['prediction_logits', 'cross_relationship_score', 'question_answering_score', 'language_attentions', 'vision_attentions', 'cross_encoder_attentions'])
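From there, the masked-token prediction can be read directly out of prediction_logits, which has shape (batch, sequence length, vocab size). A minimal sketch, with the top-1 decode used purely for illustration:
prediction_logits = model_outputs.prediction_logits  # (batch, seq_len, vocab_size)
mask_index = (inputs["input_ids"] == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = prediction_logits[0, mask_index].argmax(dim=-1)
tokenizer.decode(predicted_id)  # most likely token for the [MASK] position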
P.S.: You can control the pretraining task heads via the configuration fields task_matched, task_mask_lm, task_obj_predict, and task_qa. I assume you are only interested in mask_lm following your comment. That means you should initialize your model as follows:
from transformers import LxmertConfig, LxmertForPreTraining
config = LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased")
config.task_matched = False
config.task_obj_predict = False
config.task_qa = False
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased", config=config)
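With the extra heads disabled you should still get prediction_logits from the same forward call as above; you may see a load-time warning about checkpoint weights for the disabled heads (e.g. the QA head) not being used, which is expected in this setup. A quick check, assuming the inputs built earlier:
model_outputs = model(**inputs, output_attentions=True)
model_outputs.prediction_logits.shape  # (batch, seq_len, vocab_size)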