python opencv sift orb

Is there a way to run OpenCV's SIFT faster?


I have a directory of images that contains many duplicates, and my goal is to identify them. Because the duplicates have been cropped, resized, or converted to a different image format, they cannot be detected by comparing file hashes.

I wrote a script that successfully detects duplicates, but there is one major drawback: the script is slow. On a test-drive with a folder containing 60 items, it took five hours to run (this might also be a reflection of my increasingly buggy and slow computer). Since I have approximately 66,000 images in my directory, I estimate that it will take 229 days for the script to complete.

Can anyone suggest solutions? My research has revealed that you can free up memory by "releasing" the image stored in the variable as the loop completes, but all the information on how to do this seems to be written in C, not Python. I was also thinking of trying ORB instead of SIFT, but have concerns about its accuracy. Does anyone have advice on which of the two options would be better to pursue, or on a way to rewrite the script so it takes less memory? Many thanks in advance.
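For reference, this is roughly the ORB variant I was considering (an untested sketch: it swaps in cv2.ORB_create plus a Hamming-distance BFMatcher, since ORB produces binary descriptors that I would match with Hamming distance rather than the FLANN matcher I use for SIFT):

import cv2

# ORB detector and a brute-force matcher using Hamming distance
orb = cv2.ORB_create(nfeatures=1000)
bf = cv2.BFMatcher(cv2.NORM_HAMMING)

def orb_similarity(path_a, path_b):
    # Compute ORB keypoints and descriptors for both images
    image1 = cv2.imread(path_a)
    image2 = cv2.imread(path_b)
    kp_1, desc_1 = orb.detectAndCompute(image1, None)
    kp_2, desc_2 = orb.detectAndCompute(image2, None)
    if desc_1 is None or desc_2 is None:
        return 0.0
    # Lowe's ratio test with the same 0.6 threshold as my SIFT script
    matches = bf.knnMatch(desc_1, desc_2, k=2)
    good_points = [pair[0] for pair in matches
                   if len(pair) == 2 and pair[0].distance < 0.6 * pair[1].distance]
    return len(good_points) / max(len(kp_1), len(kp_2)) * 100

My current (working but slow) SIFT script is below.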

from __future__ import division

import cv2
import numpy as np
import glob
import pandas as pd
   

listOfTitles1 = []
listOfTitles2 = []
listOfSimilarities = []
    
# Sift and Flann
sift = cv2.xfeatures2d.SIFT_create()


index_params = dict(algorithm=0, trees=5)
search_params = dict()
flann = cv2.FlannBasedMatcher(index_params, search_params)

# Load all the images

countInner = 0
countOuter = 1

folder = r"/Downloads/images/**/*"

for a in glob.iglob(folder,recursive=True):
    for b in glob.iglob(folder,recursive=True):
    
        if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):

            continue

        if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):

            continue

        if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):

            countInner += 1
        
        print(countInner, "", countOuter)
    
        if countInner <= countOuter:

            continue

        image1 = cv2.imread(a)
        kp_1, desc_1 = sift.detectAndCompute(image1, None)
    
        image2 = cv2.imread(b)
        kp_2, desc_2 = sift.detectAndCompute(image2, None)

        matches = flann.knnMatch(desc_1, desc_2, k=2)

        good_points = []

        for m, n in matches:
            if m.distance < 0.6*n.distance:
                good_points.append(m)

        # skip this pair if no good matches were found
        if len(good_points) == 0:

            continue

        number_keypoints = 0
        if len(kp_1) >= len(kp_2):
            number_keypoints = len(kp_1)
        else:
            number_keypoints = len(kp_2)
            
        percentage_similarity = float(len(good_points)) / number_keypoints * 100

        listOfSimilarities.append(str(int(percentage_similarity)))
        listOfTitles2.append(b)

        listOfTitles1.append(a)
        
    countInner = 0
    if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
        countOuter += 1

zippedList =  list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))

print(zippedList)

dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])

dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")

Solution

  • I ran your existing implementation on my computer, on 100 images. That code took 6 hours and 31 minutes to run. Then I changed the implementation as I had suggested in my comment to compute sift.detectAndCompute only once for each image, cache the results, and use the cached results in the comparisons. This reduced the execution time on my computer, on the same 100 images, from 6 hours 31 minutes to 6 minutes and 29 seconds. I don't know if this will be fast enough for all of your images, but it is a significant reduction.

    See my modified implementation below; a further condensed sketch of the same idea follows after it.

    from __future__ import division
    
    import cv2
    import numpy as np
    import glob
    import pandas as pd
    
    
    listOfTitles1 = []
    listOfTitles2 = []
    listOfSimilarities = []
    
    # Sift and Flann
    sift = cv2.xfeatures2d.SIFT_create()
    
    
    index_params = dict(algorithm=0, trees=5)
    search_params = dict()
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    
    # Load all the images
    
    countInner = 0
    countOuter = 1
    
    folder = r"/Downloads/images/**/*"
    folder = "SiftImages/*"
    
    
    siftOut = {}
    for a in glob.iglob(folder,recursive=True):
        if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue
        image1 = cv2.imread(a)
        kp_1, desc_1 = sift.detectAndCompute(image1, None)
        siftOut[a]=(kp_1,desc_1)
    
    
    
    for a in glob.iglob(folder,recursive=True):
        if not a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            continue
    
        (kp_1,desc_1) = siftOut[a]
    
        for b in glob.iglob(folder,recursive=True):
    
    
            if not b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
    
                continue
    
            if b.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
    
                countInner += 1
    
    
            print(countInner, "", countOuter)
    
            if countInner <= countOuter:
    
                continue
    
            #### image1 = cv2.imread(a)
            #### kp_1, desc_1 = sift.detectAndCompute(image1, None)
            ####
            #### image2 = cv2.imread(b)
            #### kp_2, desc_2 = sift.detectAndCompute(image2, None)
    
            (kp_2,desc_2) = siftOut[b]
    
            matches = flann.knnMatch(desc_1, desc_2, k=2)
    
            good_points = []
    
            for m, n in matches:
                if m.distance < 0.6*n.distance:
                    good_points.append(m)
    
            # skip this pair if no good matches were found
            if len(good_points) == 0:
    
                continue
    
            number_keypoints = 0
            if len(kp_1) >= len(kp_2):
                number_keypoints = len(kp_1)
            else:
                number_keypoints = len(kp_2)
    
            percentage_similarity = float(len(good_points)) / number_keypoints * 100
    
            listOfSimilarities.append(str(int(percentage_similarity)))
            listOfTitles2.append(b)
    
            listOfTitles1.append(a)
    
        countInner = 0
        if a.lower().endswith(('.jpg','.png','.tif','.tiff','.gif')):
            countOuter += 1
    
    zippedList =  list(zip(listOfTitles1,listOfTitles2, listOfSimilarities))
    
    print(zippedList)
    
    dfObj = pd.DataFrame(zippedList, columns = ['Original', 'Title' , 'Similarity'])
    
    ### dfObj.to_csv(r"/Downloads/images/DuplicateImages3.csv")
    dfObj.to_csv(r"DuplicateImages3.2.csv")
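    As a further tidy-up along the same lines (an untested sketch, not something I benchmarked): collect the matching file paths once, cache the SIFT output per path as above, and let itertools.combinations generate each unordered pair, which replaces the countInner/countOuter bookkeeping.
    
    import glob
    import itertools
    
    import cv2
    import pandas as pd
    
    sift = cv2.xfeatures2d.SIFT_create()
    flann = cv2.FlannBasedMatcher(dict(algorithm=0, trees=5), dict())
    
    extensions = ('.jpg', '.png', '.tif', '.tiff', '.gif')
    folder = "SiftImages/*"
    paths = [p for p in glob.iglob(folder, recursive=True)
             if p.lower().endswith(extensions)]
    
    # Compute SIFT once per image, exactly as in the cached version above
    siftOut = {p: sift.detectAndCompute(cv2.imread(p), None) for p in paths}
    
    rows = []
    for a, b in itertools.combinations(paths, 2):
        kp_1, desc_1 = siftOut[a]
        kp_2, desc_2 = siftOut[b]
        # Skip images where SIFT found no descriptors
        if desc_1 is None or desc_2 is None:
            continue
        matches = flann.knnMatch(desc_1, desc_2, k=2)
        good_points = [m for m, n in matches if m.distance < 0.6 * n.distance]
        percentage_similarity = len(good_points) / max(len(kp_1), len(kp_2)) * 100
        rows.append((a, b, int(percentage_similarity)))
    
    dfObj = pd.DataFrame(rows, columns=['Original', 'Title', 'Similarity'])
    dfObj.to_csv("DuplicateImages3.2.csv")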