I am trying to decompose CLIP's text_model from Hugging Face, but I'm running into some issues I don't understand.
In particular, as far as I understand, calling CLIP.text_model should be the same as running its submodules (embeddings, encoder, final_layer_norm) in sequence, but when I compare the outputs I get different values for the two approaches.
Here is my code so far:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def decomposed_text_model(text, processor, model, device):
    inputs = processor(text=text, return_tensors="pt", padding=True)
    attn_mask = inputs["attention_mask"].clone().detach().to(torch.bool).to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    embeddings = model.text_model.embeddings(inputs["input_ids"])
    position_embeddings = model.text_model.embeddings.position_embedding.weight[:inputs['input_ids'].shape[1]]
    embeddings = embeddings + position_embeddings.unsqueeze(0)
    encoder_output = model.text_model.encoder(
        inputs_embeds=embeddings,
        attention_mask=attn_mask).last_hidden_state
    embeddings = model.text_model.final_layer_norm(encoder_output)
    return embeddings
def text_model(text, processor, model):
    inputs = processor(text="a photo of a cat", return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    return model.text_model(**inputs)
# two step text approach
out1 = decomposed_text_model("a photo of a cat", processor, model)
out1 = out1.last_hidden_state[0, -1, :] # get eos token
out1 = out1.squeeze()
# one step text approach
out2 = text_model("a photo of a cat", processor, model)
out2 = out2.last_hidden_state[0, -1, :] # get eos token
out2 = out2.squeeze()
# compare
out1 = out1 / out1.norm(p=2, dim=-1, keepdim=True)
out2 = out2 / out2.norm(p=2, dim=-1, keepdim=True)
diff = torch.max(torch.abs(out1 - out2))
print(diff)
with diff being a somewhat high number (more fine-grained logging also revealed significant differences between the two EOS tensors).
What am I missing? Please understand that the decomposed approach is necessary for what I am implementing, so I cannot simply call text_model directly.
There are a few errors in your code (for example, model.text_model.embeddings already adds the position embeddings internally, so you are adding them a second time), but the most important step you forgot when reproducing the logic of the CLIP text model is the 4D causal attention mask. You can find the relevant code here, and your code should look as follows:
import torch
from transformers import CLIPModel, CLIPTokenizerFast
from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
device = "cuda" if torch.cuda.is_available() else "cpu"
m = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
m = m.to(device)
t = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
text = "a photo of a cat"
inputs = t(text, return_tensors="pt")
inputs.to(device)
@torch.no_grad()
def decomposed_text_model(inputs, model):
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    input_shape = input_ids.size()
    # CLIPTextEmbeddings already adds the position embeddings internally
    hidden_states = model.text_model.embeddings(input_ids=input_ids)
    # Build the 4D causal mask and expand the 2D padding mask to 4D,
    # mirroring what CLIPTextTransformer does internally
    causal_attention_mask = _create_4d_causal_attention_mask(
        input_shape, hidden_states.dtype, device=hidden_states.device
    )
    attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
    encoder_output = model.text_model.encoder(
        inputs_embeds=hidden_states,
        attention_mask=attention_mask,
        causal_attention_mask=causal_attention_mask,
    ).last_hidden_state
    embeddings = model.text_model.final_layer_norm(encoder_output)
    return embeddings
@torch.no_grad()
def text_model(inputs, model):
    return model.text_model(**inputs)
# two step text approach
out1 = decomposed_text_model(inputs, m)
out1 = out1[0, -1, :] # get eos token
out1 = out1.squeeze()
# one step text approach
out2 = text_model(inputs, m)
out2 = out2.last_hidden_state[0, -1, :] # get eos token
out2 = out2.squeeze()
# compare
print(torch.allclose(out1, out2))
Output:
True
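For intuition, here is a minimal sketch of what that causal mask contains (assuming a transformers version where the helper lives in transformers.modeling_attn_mask_utils, as imported above; the sequence length of 5 is just for illustration). The helper returns a (batch, 1, seq_len, seq_len) tensor with 0 on and below the diagonal and a very large negative value above it, which is added to the attention scores so each token can only attend to itself and earlier tokens:

import torch
from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask

# Causal mask for a batch of one sequence of length 5
mask = _create_4d_causal_attention_mask((1, 5), torch.float32, device="cpu")
print(mask.shape)   # torch.Size([1, 1, 5, 5])
print(mask[0, 0])   # 0.0 on/below the diagonal, a very large negative value above it

Without this mask (your code only passed the 2D padding mask to the encoder), the attention is effectively bidirectional over the sequence, which is the main reason your decomposed version diverged from model.text_model.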