I have prepared a small code example, but it throws an error. I can't work out the problem, because as far as I can tell it should work.
Also, do you think there are better approaches for calculating image similarity? I want to find similar clothing images, e.g. given an image of a coat, I want to find similar coats.
Would this code also handle all image dimensions and all image types?
Here is the code:
import torch
import torchvision.transforms as transforms
import urllib.request
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ID = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ID).to(device)
preprocess = CLIPProcessor.from_pretrained(model_ID)
# Define a function to load an image and preprocess it for CLIP
def load_and_preprocess_image(image_path):
    # Load the image from the specified path
    image = Image.open(image_path)
    # Apply the CLIP preprocessing to the image
    image = preprocess(image).unsqueeze(0).to(device)
    # Return the preprocessed image
    return image
# Load the two images and preprocess them for CLIP
image_a = load_and_preprocess_image('/content/a.png')
image_b = load_and_preprocess_image('/content/b.png')
# Calculate the embeddings for the images using the CLIP model
with torch.no_grad():
    embedding_a = model.encode_image(image_a)
    embedding_b = model.encode_image(image_b)
# Calculate the cosine similarity between the embeddings
similarity_score = torch.nn.functional.cosine_similarity(embedding_a, embedding_b)
# Print the similarity score
print('Similarity score:', similarity_score.item())
Here is the error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-24-e95a926e1bc8> in <module>
25
26 # Load the two images and preprocess them for CLIP
---> 27 image_a = load_and_preprocess_image('/content/a.png')
28 image_b = load_and_preprocess_image('/content/b.png')
29
3 frames
/usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils_base.py in _call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2579
2580 if not _is_valid_text_input(text):
-> 2581 raise ValueError(
2582 "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2583 "or `List[List[str]]` (batch of pretokenized examples)."
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)
I am not sure why this code is supposed to work, since it contains several errors: CLIPModel has no encode_image method, and the first positional argument of CLIPProcessor.__call__ expects text; the second argument is for images, which is why the tokenizer complains about an invalid text input. Please find the corrected code below:
import torch
from transformers import CLIPImageProcessor, CLIPModel
from PIL import Image
# Load the CLIP model
model_ID = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ID)
preprocess = CLIPImageProcessor.from_pretrained(model_ID)
# Define a function to load an image and preprocess it for CLIP
def load_and_preprocess_image(image_path):
    # Load the image from the specified path
    image = Image.open(image_path)
    # Apply the CLIP preprocessing to the image
    image = preprocess(image, return_tensors="pt")
    # Return the preprocessed image
    return image
# Load the two images and preprocess them for CLIP
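# Note: both calls below load the same file, which is why the printed similarity score is ~1.0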
image_a = load_and_preprocess_image('/content/bla.png')["pixel_values"]
image_b = load_and_preprocess_image('/content/bla.png')["pixel_values"]
# Calculate the embeddings for the images using the CLIP model
with torch.no_grad():
    embedding_a = model.get_image_features(image_a)
    embedding_b = model.get_image_features(image_b)
# Calculate the cosine similarity between the embeddings
similarity_score = torch.nn.functional.cosine_similarity(embedding_a, embedding_b)
# Print the similarity score
print('Similarity score:', similarity_score.item())
Output:
Similarity score: 1.0000001192092896
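On the follow-up questions: CLIPImageProcessor resizes and normalizes every input to the 224x224 resolution this checkpoint expects, so arbitrary image dimensions are fine; for images that are not plain RGB (grayscale, palette, or RGBA PNGs), opening them with Image.open(path).convert("RGB") is a safe guard. For finding similar coats, a common approach is exactly what the code above does, just at scale: embed your whole catalog once with get_image_features, L2-normalize the embeddings, and rank them by cosine similarity against the query image. Below is a minimal sketch of that idea; the catalog folder /content/catalog and the query image /content/a.png are placeholders you would replace with your own data:
import os
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPModel
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ID = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ID).to(device)
preprocess = CLIPImageProcessor.from_pretrained(model_ID)
def embed_image(image_path):
    # convert("RGB") guards against grayscale/RGBA/palette images;
    # the processor then resizes and normalizes to the input size CLIP expects
    image = Image.open(image_path).convert("RGB")
    inputs = preprocess(image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = model.get_image_features(**inputs)
    # L2-normalize so cosine similarity becomes a plain dot product
    return features / features.norm(dim=-1, keepdim=True)
# Placeholder paths -- replace with your own catalog folder and query image
catalog_dir = "/content/catalog"
catalog_paths = [os.path.join(catalog_dir, f) for f in sorted(os.listdir(catalog_dir))]
catalog_embeddings = torch.cat([embed_image(p) for p in catalog_paths])  # shape (N, 512)
query_embedding = embed_image("/content/a.png")  # the coat to search with
scores = (catalog_embeddings @ query_embedding.T).squeeze(1)  # cosine similarities, shape (N,)
top = scores.topk(k=min(5, len(catalog_paths)))
for score, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{catalog_paths[idx]}: {score:.3f}")
For catalogs beyond a few thousand images, the same embeddings can be fed into an approximate nearest-neighbor index (e.g. FAISS) instead of the brute-force matrix product used here.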