Tags: python, computer-vision, ocr, tesseract, captcha

Remove noise and lines from the CAPTCHA image


I need help cleaning up a CAPTCHA image. I wrote a test algorithm to process some of the CAPTCHA images that come from a website, and for some of the types I was able to clean them up and extract the text successfully with Tesseract. However, one of the images follows a different pattern and I haven't been able to make much progress: I need to remove the diagonal lines so that I can read the text. My code is below, along with the images before and after processing. Do you have any ideas on how to deal with the noise that still remains?

import pytesseract
import cv2
from PIL import Image
import numpy as np

def limpar_imagem(imagem: Image.Image):
    imagem = imagem.convert('RGB')
    largura, altura = imagem.size
    array = np.array(imagem)

    # Convert each pixel to a single integer to make comparisons easier
    def rgb_to_hex_val(pixel):
        return (pixel[0] << 16) + (pixel[1] << 8) + pixel[2]

    # Define the thresholds
    escuro_min = 0x000000
    escuro_max = 0x999999
    branco1 = 0xFEFEFE
    branco2 = 0xFAFAFA
    preto = 0x000000

    resultado = array.copy()

    for x in range(1, largura - 1):  # skip the borders
        for y in range(altura):
            cor = rgb_to_hex_val(array[y, x])
            if (escuro_min <= cor <= escuro_max) or (cor in [branco1, branco2, preto]):
                viz_esq = rgb_to_hex_val(array[y, x - 1])
                viz_dir = rgb_to_hex_val(array[y, x + 1])
                if escuro_min <= viz_esq <= escuro_max and escuro_min <= viz_dir <= escuro_max:
                    resultado[y, x] = array[y, x]

    return Image.fromarray(resultado)

# Path to the Tesseract executable on Windows
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

metodos = [
    cv2.THRESH_BINARY,
    cv2.THRESH_BINARY_INV,
    cv2.THRESH_TRUNC,
    cv2.THRESH_TOZERO,
    cv2.THRESH_TOZERO_INV,
]

imagem = cv2.imread('image.png')

# convert the image to grayscale
imagem_cinza = cv2.cvtColor(imagem, cv2.COLOR_RGB2GRAY)

for index, metodo in enumerate(metodos):
    _, imagem_tratada = cv2.threshold(imagem_cinza, 127, 255, metodo or cv2.THRESH_OTSU)

    cv2.imwrite(f'imagem_tratada_{index}.png', imagem_tratada)

# Open the processed image
imagem = Image.open("imagem_tratada_2.png")

imagem_array = np.array(imagem)

# Define the range of values considered "gray"
limiar_inferior = 100
limiar_superior = 255

# Create a mask for pixels within the gray range
mascara_cinza = (imagem_array >= limiar_inferior) & (imagem_array <= limiar_superior)

# Set those pixels to white
imagem_array[mascara_cinza] = 255
imagem_array[0:6, :] = 255  # also whiten the first few rows at the top

limiar_preto = 30  # adjust as needed

# Mask of the black pixels in the original image
mascara_preto = imagem_array <= limiar_preto

# Make them fully black (set to 0)
imagem_array[mascara_preto] = 0

# Convert back to an image and save it
imagem_resultado = Image.fromarray(imagem_array)
imagem_resultado.save('teste/imagemFinal.png')

imagem = Image.open('teste/imagemFinal.png')

img = limpar_imagem(imagem)
img.save("saida_2.png")

texto = pytesseract.image_to_string(Image.open("saida_2.png"), config='--psm 7')
print("Texto reconhecido:", texto)

Image to manipulate:

[original CAPTCHA image]

Image result:

[image after processing]


Solution

  • Focusing on the problem presented, which is removing repeating patterns from an image: this can be solved by cycling through all the rows or columns and looking for repeating pixel values. I have a sample implementation here that works well on a grayscale image but could be adapted to multi-channel images if necessary (a rough per-channel sketch appears further down).

    testimage.png is a direct download of your original image above.

    import PIL.Image
    import numpy as np
    import matplotlib.pyplot as plt
    
    def get_max_channel_img(img: np.ndarray | PIL.Image.Image) -> np.ndarray:
        # Collapse an RGB image to one channel by taking the per-pixel maximum
        if isinstance(img, PIL.Image.Image):
            img = np.array(img)
        return np.max(img, axis=2)

    def get_gray_img(img: np.ndarray | PIL.Image.Image) -> np.ndarray:
        # Alternative grayscale conversion (not used in _main below)
        if isinstance(img, PIL.Image.Image):
            return np.array(img.convert('L'))
        return np.linalg.norm(img, axis=2)
    
    def filter_repeats(img: np.ndarray) -> np.ndarray:
        # What's the minimum number of times the pattern must repeat itself
        MIN_REPEATS = 8
        # What's the minimum period of the pattern, in pixels
        MIN_STEP = 2
        # What's the "closeness" required for two pixels to be considered similar
        CLOSENESS_RATIO = 0.2
        # What percentage of the pixels have to match in order to filter out the pattern
        MIN_THRESHOLD = 0.95
        # Upper bound on the period to check, so the pattern repeats at least MIN_REPEATS times
        total_periods = img.shape[1] // MIN_REPEATS
    
        # This is the altered array that will be returned
        out_array = img.copy()
        # Cycle through each row in the image
        for row_num in range(img.shape[0]):
            # Cycle through each period
            for period_val in range(MIN_STEP, total_periods):
                # Check for matches at every possible offset
                #   (The matches may not start in the first pixel)
                for offset in range(period_val):
                    # What are the columns of the pixels to check
                    col_values = [
                        offset + period_val * x
                        for x in range(img.shape[1] // period_val + 1)
                    ]
                    # A +1 was added above to capture the last pixel
                    #   which is sometimes missed otherwise.
                    #   But that means that sometimes the last position exceeds img dimensions
                    if col_values[-1] >= img.shape[1]:
                        col_values = col_values[:-1]
                    # Grab each pixel's value
                    values = [int(img[row_num, i]) for i in col_values]
                    # Calculate the average of all the pixels
                    avg_value = np.average(values)
                    # Compare each pixel to that average
                    deviations = [
                        float(abs(x - avg_value)) / 255 < CLOSENESS_RATIO
                        for x in values
                    ]
                    # Check to see if enough of them meet the closeness
                    if np.average(deviations) > MIN_THRESHOLD:
                        # If they met it, replace each pixel with white
                        for i in col_values:
                            out_array[row_num, i] = 255
        return out_array
    
    def _main():
        fpath = 'testimage.png'
        image = PIL.Image.open(fpath)
    
        colour_image2 = get_max_channel_img(image)
        colour_image3 = filter_repeats(colour_image2)
        colour_image4 = colour_image3.copy()
        colour_image4[colour_image4 > 230] = 255
    
        fig, axarr = plt.subplots(3, 1)
    
        axarr[0].imshow(colour_image2, cmap='gray', vmin=0, vmax=255)
        axarr[1].imshow(colour_image3, cmap='gray', vmin=0, vmax=255)
        axarr[2].imshow(colour_image4, cmap='gray', vmin=0, vmax=255)
        plt.savefig('OUTPUT.PNG')
        plt.show()
    
    if __name__ == '__main__':
        _main()
    

    Here is the plot it produces:

    [plot: the original grayscale image, the pattern-filtered image, and the threshold-filtered image]

    The top image is the original, grayscaled; the middle one is filtered for repeating patterns; and the bottom one is filtered for patterns and then passed through a basic threshold, which removes a bunch of near-white pixels scattered throughout the image and leaves behind a fairly pristine view of the letters.
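
    From there you could hand the cleaned array straight back to Tesseract. Here is a minimal sketch, assuming the helper functions from the listing above (get_max_channel_img and filter_repeats) are defined in the same file; it reuses the --psm 7 setting from your own script.

    import PIL.Image
    import numpy as np
    import pytesseract

    # On Windows, point pytesseract at the executable first, as in your script:
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    image = PIL.Image.open('testimage.png')
    gray = get_max_channel_img(image)   # collapse RGB to a single channel
    cleaned = filter_repeats(gray)      # strip the repeating line pattern
    cleaned[cleaned > 230] = 255        # knock out the near-white speckles

    # --psm 7 treats the image as a single line of text
    text = pytesseract.image_to_string(
        PIL.Image.fromarray(cleaned.astype(np.uint8)),
        config='--psm 7',
    )
    print('Recognized text:', text)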

    The ALL_CAPS variables are just general values I grabbed without much testing. You should adjust them to work well with your other images; they happened to work well on the sample you provided, so if your other images all follow that pattern, these values will probably be fine.
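
    As mentioned at the top, the same idea could be applied to a multi-channel image if the pattern only shows up in colour. A minimal sketch, assuming filter_repeats from the listing above is in scope: run the filter on each channel independently and stack the cleaned channels back together.

    import PIL.Image
    import numpy as np

    def filter_repeats_rgb(img: np.ndarray | PIL.Image.Image) -> np.ndarray:
        # Run the row-based repeat filter on each colour channel separately,
        # then stack the cleaned channels back into one RGB array.
        if isinstance(img, PIL.Image.Image):
            img = np.array(img.convert('RGB'))
        channels = [filter_repeats(img[:, :, c]) for c in range(img.shape[2])]
        return np.stack(channels, axis=2)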

    In the example code I showed only row-based filtering, since your image is much wider than it is tall, but the function can be adjusted for column-based filtering if you need it.
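
    For example, a minimal sketch of that adjustment, again assuming filter_repeats from the listing above is in scope: transposing the array turns columns into rows, so the existing row-based filter can be reused unchanged.

    import numpy as np

    def filter_repeats_columns(img: np.ndarray) -> np.ndarray:
        # Transpose so that columns become rows, reuse the row-based filter,
        # then transpose the result back to the original orientation.
        return filter_repeats(img.T).T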

    Let me know if you have any questions.