I have a directory of images that contains many unidentified duplicates. My goal is to identify them. Because the duplicates have been cropped, resized, or converted to a different image format, they cannot be detected by comparing their hashes.
I wrote a script that successfully detects duplicates, but there is one major drawback: the script is slow. On a test-drive with a folder containing 60 items, it took five hours to run (this might also be a reflection of my increasingly buggy and slow computer). Since I have approximately 66,000 images in my directory, I estimate that it will take 229 days for the script to complete.
Can anyone suggest solutions? My research has revealed that you can free up memory by "releasing" the image stored in the variable as the loop completes, but all the information on how to do this seems to be written for C, not Python. I was also thinking of trying ORB instead of SIFT, but I have concerns about its accuracy. Does anyone have advice on which of the two options would be better to pursue, or on a way to rewrite the script so it takes less memory? Many thanks in advance.
from __future__ import division
import cv2
import numpy as np
import glob
import pandas as pd
listOfTitles1 = []
listOfTitles2 = []
listOfSimilarities = []
# Sift and Flann
sift = cv2.xfeatures2d.SIFT_create()
index_params = dict(algorithm=0, trees=5)
search_params = dict()
flann = cv2.FlannBasedMatcher(index_params, search_params)
# Load all the images1
countInner = 0
countOuter = 1
folder = r"/Downloads/images/**/*"
for a in glob.iglob(folder,recursive=True):
    for b in glob.iglob(folder,recursive=True):
        if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue
        if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue
        if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            countInner += 1
        print(countInner, "", countOuter)
        if countInner <= countOuter:
            continue
        image1 = cv2.imread(a)
        kp_1, desc_1 = sift.detectAndCompute(image1, None)
        image2 = cv2.imread(b)
        kp_2, desc_2 = sift.detectAndCompute(image2, None)
        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = []
        if good_points == 0:
            continue
        for m, n in matches:
            if m.distance < 0.6*n.distance:
                good_points.append(m)
        number_keypoints = 0
        if len(kp_1) >= len(kp_2):
            number_keypoints = len(kp_1)
        else:
            number_keypoints = len(kp_2)
        percentage_similarity = float(len(good_points)) / number_keypoints * 100
        listOfSimilarities.append(str(int(percentage_similarity)))
        listOfTitles2.append(b)
        listOfTitles1.append(a)
    countInner = 0
    if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        countOuter += 1
zippedList = list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))
print(zippedList)
dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])
dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")
I ran your existing implementation on my computer, on 100 images. That code took 6 hours and 31 minutes to run. Then I changed the implementation as I had suggested in my comment: compute sift.detectAndCompute only once per image, cache the results, and use the cached results in the comparisons. This reduced the execution time on my computer, on the same 100 images, from 6 hours 31 minutes to 6 minutes and 29 seconds. I don't know whether this will be fast enough for all of your images, but it is a significant reduction.
See my modified implementation below.
from __future__ import division
import cv2
import numpy as np
import glob
import pandas as pd
listOfTitles1 = []
listOfTitles2 = []
listOfSimilarities = []
# Sift and Flann
sift = cv2.xfeatures2d.SIFT_create()
index_params = dict(algorithm=0, trees=5)
search_params = dict()
flann = cv2.FlannBasedMatcher(index_params, search_params)
# Load all the images1
countInner = 0
countOuter = 1
folder = r"/Downloads/images/**/*"
folder = "SiftImages/*"
siftOut = {}
for a in glob.iglob(folder,recursive=True):
    if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        continue
    image1 = cv2.imread(a)
    kp_1, desc_1 = sift.detectAndCompute(image1, None)
    siftOut[a] = (kp_1, desc_1)

for a in glob.iglob(folder,recursive=True):
    if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        continue
    (kp_1, desc_1) = siftOut[a]
    for b in glob.iglob(folder,recursive=True):
        if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue
        if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            countInner += 1
        print(countInner, "", countOuter)
        if countInner <= countOuter:
            continue
        #### image1 = cv2.imread(a)
        #### kp_1, desc_1 = sift.detectAndCompute(image1, None)
        ####
        #### image2 = cv2.imread(b)
        #### kp_2, desc_2 = sift.detectAndCompute(image2, None)
        (kp_2, desc_2) = siftOut[b]
        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = []
        if good_points == 0:
            continue
        for m, n in matches:
            if m.distance < 0.6*n.distance:
                good_points.append(m)
        number_keypoints = 0
        if len(kp_1) >= len(kp_2):
            number_keypoints = len(kp_1)
        else:
            number_keypoints = len(kp_2)
        percentage_similarity = float(len(good_points)) / number_keypoints * 100
        listOfSimilarities.append(str(int(percentage_similarity)))
        listOfTitles2.append(b)
        listOfTitles1.append(a)
    countInner = 0
    if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        countOuter += 1
zippedList = list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))
print(zippedList)
dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])
### dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")
dfObj.to_csv(r"DuplicateImages3.2.csv")