python · ocr · tesseract · python-tesseract · image-preprocessing

Incorrect digit detection using Tesseract OCR on video frames in Python


I'm trying to calculate the real time of video recording. I have a lot of videos, some of which were corrupted during transmission. All of them are in mp4 format. To get the duration, I recognize the on-screen time using pytesseract.image_to_string, but I get incorrect results.

My approach involves capturing the frames, preprocessing the images, and using Tesseract to extract the text. However, the digit recognition is often incorrect or inconsistent.

The image on which the code recognizes the numbers is incorrect:

the image on which the code recognizes the numbers is incorrect

The above image gives a similarly wrong answer: the time is calculated from 'data2/20240619_191208_first_frame.jpg' as 19:12:66

The full code looks like this:

def digit_detect(image):
    """OCR *image* with Tesseract, restricted to digits and ':'.

    Returns the raw recognized text (Tesseract may append a newline).
    """
    whitelist_config = '--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789:'
    return pytesseract.image_to_string(image, config=whitelist_config)

def resize_roi(image, x1 = 131, y1 = 11, x2 = 228, y2  = 32):
    """Crop the rectangular region (x1, y1)-(x2, y2) out of *image*.

    Coordinates are pixels; (x1, y1) is the top-left corner and
    (x2, y2) the exclusive bottom-right corner. The defaults select the
    on-screen timestamp overlay of the recordings being processed.
    """
    return image[y1:y2, x1:x2]

def preprocess_image(image):
    """Convert a BGR frame into a black/white image suitable for OCR.

    The frame is first converted to grayscale, then binarized with a
    global threshold chosen automatically by Otsu's method.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    threshold_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binarized = cv2.threshold(grayscale, 0, 255, threshold_flags)
    return binarized

# def extract_time_from_image(image):

#     regions = [
#         (131, 11, 142, 31, '012'),         # Tens of hours (0-2)
#         (142, 11, 155, 31, '0123456789'),  # Units of hours (0-9)
#         (163, 11, 179, 31, '012345'),      # Tens of minutes (0-5)
#         (179, 11, 193, 31, '0123456789'),  # Units of minutes (0-9)
#         (202, 11, 215, 31, '012345'),      # Tens of seconds (0-5)
#         (215, 11, 226, 31, '0123456789')   # Units of seconds (0-9)
#     ]

#     digits = []

#     for (x1, y1, x2, y2, whitelist) in regions:

#         preprocess = preprocess_image(image)
      
#         resized_roi = resize_roi(preprocess, x1, y1, x2, y2)


#         custom_config = f'--psm 6 --oem 3 -c tessedit_char_whitelist={whitelist}'
#         digit = pytesseract.image_to_string(resized_roi, config=custom_config)
      
#         digits.append(digit)

#     return digits
    

# Directory containing the .mp4 recordings to scan.
folder_path = 'data/output_rec/rkbt/1' 
# Directory where the cropped, binarized first/last frames are written.
load_path = "data2"

# Abort if the input directory is missing.
# NOTE(review): "Error1"/"Error2"/"Error3" are uninformative messages —
# consider descriptive text including the offending path/file.
if not os.path.isdir(folder_path):
    print(f"Error1")
    exit()

# Collect every .mp4 file (non-recursive) from the input directory.
video_files = [f for f in os.listdir(folder_path) if f.endswith('.mp4')]

for video_file in video_files:
    video_path = os.path.join(folder_path, video_file)

    cap = cv2.VideoCapture(video_path)

    # Skip files OpenCV cannot open (e.g. corrupted transfers).
    if not cap.isOpened():
        print(f"Error2")
        continue

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Grab the very first frame (carries the start-of-recording timestamp).
    ret, first_frame = cap.read()
    if not ret:
        print(f"Error3")
        cap.release()
        continue

    # Seek to the last frame index.
    # NOTE(review): CAP_PROP_FRAME_COUNT can be inaccurate for damaged
    # files, so the read below may fail — hence the second ret check.
    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames - 1)

    ret, last_frame = cap.read()
    if not ret:
        print(f"Error3")
        cap.release()
        continue

    cap.release()

    # Crop both frames down to the on-screen clock region...
    first_frame = resize_roi(first_frame)
    last_frame = resize_roi(last_frame)

    # ...and binarize them before running OCR.
    first_frame = preprocess_image(first_frame)
    last_frame = preprocess_image(last_frame)

    # print(extract_time_from_image(first_frame))
    # print(extract_time_from_image(last_frame))

    # Output paths mirror the video name, e.g. "<video>_first_frame.jpg".
    first_frame_path = os.path.join(load_path, f"{os.path.splitext(video_file)[0]}_first_frame.jpg")
    last_frame_path = os.path.join(load_path, f"{os.path.splitext(video_file)[0]}_last_frame.jpg")

    # OCR the timestamp out of each processed frame and report it.
    print(f"the time is calculated from '{first_frame_path}'" , digit_detect(first_frame))
    print(f"the time is calculated from '{last_frame_path}'" , digit_detect(last_frame))

    # Save the processed frames for manual inspection.
    cv2.imwrite(first_frame_path, first_frame)
    cv2.imwrite(last_frame_path, last_frame)

    print(f"Saved images with the first and last frames for '{video_file}'")


Solution

  • As mentioned in the comments that image is easily and correctly OCR-ed by Tesseract without any extra pre-processing:

    config = '--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789:'
    text = ocr.image_to_string(image, config=config).strip()
    

    As an alternative to Tesseract you could use the OpenCV dnn-inference engine to detect and recognise the text. It is required that you have on your machine the files containing the weights of the model(s) (see this tutorial for details and info)

    Here a sample example of text recognition:

    # initialize the dnn-model
    path_of_the_dnn_weights = "path_from_compiler/crnn_cs.onnx" # check link "tutorial" for download
    
    dnn_model = cv2.dnn_TextRecognitionModel(path_of_the_dnn_weights)
    dnn_model.setDecodeType("CTC-greedy")
    dnn_model.setVocabulary("0123456789:")
    dnn_model.setInputParams(scale=1/127.5, size=(100, 32), mean=(127.5, 127.5, 127.5), swapRB=True)
    
    # evaluate the model
    rec_result = dnn_model.recognize(image)
    print(rec_result)
    

    When working with video frames, DB18 is a very fast DNN text-detection model. DB refers to the paper about real-time scene text detection with differentiable binarization.