python, image-processing, huggingface-transformers, bert-language-model, multimodal

Prediction logits using LXMERT with the Hugging Face library


How can we get the prediction logits from the LXMERT model using the Hugging Face library? It's fairly easy to get them with VisualBERT, but I'm not able to get them with LXMERT. In the case of the VisualBERT model, the keys I'm getting are:

['prediction_logits', 'seq_relationship_logits', 'attentions']

and with the LXMERT model, the keys are:

['language_output', 'vision_output', 'pooled_output', 'language_attentions', 'vision_attentions', 'cross_encoder_attentions']

Even though the documentation mentions prediction logits, I am not able to get them. If someone can help, that would be great.

EDIT: Link to the Colab notebook for LXMERT.


Solution

  • Use LxmertForPreTraining instead of LxmertModel:

    ### Colab setup commands
    # !pip install transformers
    # !git clone https://github.com/huggingface/transformers
    # %cd transformers/examples/research_projects/lxmert
    # !pip install wget
    # (the frcnn helper modules imported below live in that directory)
    
    import torch
    
    # helper modules from transformers/examples/research_projects/lxmert
    from processing_image import Preprocess
    from modeling_frcnn import GeneralizedRCNN
    from utils import Config
    
    torch.cuda.is_available()
    
    URL = "https://github.com/jacobgil/vit-explain/raw/main/examples/both.png"
    
    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
    image_preprocess = Preprocess(frcnn_cfg)
    
    # run frcnn
    images, sizes, scales_yx = image_preprocess(URL)
    output_dict = frcnn(
        images,
        sizes,
        scales_yx=scales_yx,
        padding="max_detections",
        max_detections=frcnn_cfg.max_detections,
        return_tensors="pt",
    )
    
    # Very important that the boxes are normalized
    normalized_boxes = output_dict.get("normalized_boxes")
    features = output_dict.get("roi_features")
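    # roi_features are the pooled region features (typically (1, num_boxes, 2048)
    # for this checkpoint); normalized_boxes are the matching box coordinates
    # scaled to [0, 1], shape (1, num_boxes, 4)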
    
    from transformers import LxmertTokenizer, LxmertForPreTraining
    
    tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
    model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
    
    text_sentence = "dog and cat are in the room and " + tokenizer.mask_token + " is laying on the ground"
    
    inputs = tokenizer(text_sentence, return_token_type_ids=True, return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
    
    visual_feats = features
    visual_pos = normalized_boxes
    # attend to all detected regions: mask of shape (batch, num_boxes)
    visual_attention_mask = torch.ones(features.shape[:-1], dtype=torch.long)
    
    inputs.update(
        {
            "visual_feats": visual_feats,
            "visual_pos": visual_pos,
            "visual_attention_mask": visual_attention_mask,
        }
    )
    
    model_outputs = model(**inputs, output_attentions=True)
    
    model_outputs.keys()
    

    Output:

    odict_keys(['prediction_logits', 'cross_relationship_score', 'question_answering_score', 'language_attentions', 'vision_attentions', 'cross_encoder_attentions'])
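
    The prediction_logits tensor has shape (batch_size, sequence_length, vocab_size), so you can read off the prediction for the masked position directly. A minimal sketch, continuing from the variables above:

    # locate the [MASK] position and decode the highest-scoring token
    prediction_logits = model_outputs.prediction_logits
    mask_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    predicted_id = prediction_logits[0, mask_index].argmax(dim=-1)
    print(tokenizer.decode(predicted_id))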
    

    P.S.: You can control the pretraining task heads via the configuration fields task_matched, task_mask_lm, task_obj_predict, and task_qa. I assume you are only interested in mask_lm following your comment. That means you should initialize your model as follows:

    from transformers import LxmertConfig, LxmertForPreTraining
    
    config = LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased")
    config.task_matched = False
    config.task_obj_predict = False
    config.task_qa = False
    model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased", config=config)
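
    With those three flags disabled, only the masked-LM objective stays active, so prediction_logits is the output to work with for filling in the mask.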