I'm trying to calculate the real recording time of each video. I have a lot of videos, some of which were lost during transmission; all of them are in mp4 format. To get the duration, I read the on-screen timestamp with pytesseract.image_to_string, but I get incorrect results.
My approach involves capturing the frames, preprocessing the images, and using Tesseract to extract the text. However, the digit recognition is often incorrect or inconsistent.
The cropped timestamp image on which the code runs the recognition produces an incorrect result, for example:
the time is calculated from 'data2/20240619_191208_first_frame.jpg' 19:12:66
(66 is not a valid seconds value).
The full code looks like this:
import os
import cv2
import pytesseract

def digit_detect(image):
    text = pytesseract.image_to_string(image, config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789:')
    return text

def resize_roi(image, x1=131, y1=11, x2=228, y2=32):
    roi = image[y1:y2, x1:x2]
    return roi

def preprocess_image(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
# def extract_time_from_image(image):
#     regions = [
#         (131, 11, 142, 31, '012'),          # Tens of hours (0-2)
#         (142, 11, 155, 31, '0123456789'),   # Units of hours (0-9)
#         (163, 11, 179, 31, '012345'),       # Tens of minutes (0-5)
#         (179, 11, 193, 31, '0123456789'),   # Units of minutes (0-9)
#         (202, 11, 215, 31, '012345'),       # Tens of seconds (0-5)
#         (215, 11, 226, 31, '0123456789')    # Units of seconds (0-9)
#     ]
#     digits = []
#     for (x1, y1, x2, y2, whitelist) in regions:
#         preprocess = preprocess_image(image)
#         resized_roi = resize_roi(preprocess, x1, y1, x2, y2)
#         custom_config = f'--psm 6 --oem 3 -c tessedit_char_whitelist={whitelist}'
#         digit = pytesseract.image_to_string(resized_roi, config=custom_config)
#         digits.append(digit)
#     return digits
folder_path = 'data/output_rec/rkbt/1'
load_path = "data2"
os.makedirs(load_path, exist_ok=True)  # make sure the output folder exists

if not os.path.isdir(folder_path):
    print("Error1")  # input folder not found
    exit()

video_files = [f for f in os.listdir(folder_path) if f.endswith('.mp4')]

for video_file in video_files:
    video_path = os.path.join(folder_path, video_file)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error2")  # video could not be opened
        continue

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    ret, first_frame = cap.read()
    if not ret:
        print("Error3")  # first frame could not be read
        cap.release()
        continue

    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames - 1)
    ret, last_frame = cap.read()
    if not ret:
        print("Error3")  # last frame could not be read
        cap.release()
        continue
    cap.release()

    first_frame = resize_roi(first_frame)
    last_frame = resize_roi(last_frame)
    first_frame = preprocess_image(first_frame)
    last_frame = preprocess_image(last_frame)

    # print(extract_time_from_image(first_frame))
    # print(extract_time_from_image(last_frame))

    first_frame_path = os.path.join(load_path, f"{os.path.splitext(video_file)[0]}_first_frame.jpg")
    last_frame_path = os.path.join(load_path, f"{os.path.splitext(video_file)[0]}_last_frame.jpg")

    print(f"the time is calculated from '{first_frame_path}'", digit_detect(first_frame))
    print(f"the time is calculated from '{last_frame_path}'", digit_detect(last_frame))

    cv2.imwrite(first_frame_path, first_frame)
    cv2.imwrite(last_frame_path, last_frame)
    print(f"Saved images with the first and last frames for '{video_file}'")
As mentioned in the comments, that image is easily and correctly OCR-ed by Tesseract without any extra pre-processing (here ocr is pytesseract imported under that alias, and image is the cropped timestamp frame):
config = '--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789:'
text = ocr.image_to_string(image, config=config).strip()
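If you also want to catch misreads such as 19:12:66, a quick sanity check on the OCR output helps. A minimal sketch; the regex and helper name are my own additions:

import re

def is_valid_timestamp(text):
    # accept only a plausible HH:MM:SS string
    return re.fullmatch(r'([01]\d|2[0-3]):[0-5]\d:[0-5]\d', text) is not None

print(is_valid_timestamp("19:12:06"))  # True
print(is_valid_timestamp("19:12:66"))  # False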
As an alternative to Tesseract, you could use the OpenCV dnn inference engine to detect and recognise the text. You need the files containing the model weights on your machine (see this tutorial for details and download links).
Here is a small example of text recognition:
import cv2

# initialise the dnn model
path_of_the_dnn_weights = "path_from_compiler/crnn_cs.onnx"  # check the "tutorial" link for the download
dnn_model = cv2.dnn_TextRecognitionModel(path_of_the_dnn_weights)
dnn_model.setDecodeType("CTC-greedy")
dnn_model.setVocabulary("0123456789:")
dnn_model.setInputParams(scale=1/127.5, size=(100, 32), mean=(127.5, 127.5, 127.5), swapRB=True)

# evaluate the model (image is the cropped timestamp region)
rec_result = dnn_model.recognize(image)
print(rec_result)
When working with video frames, DB18 is a very fast DNN text-detection model. DB refers to the paper Real-Time Scene Text Detection with Differentiable Binarization.
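A rough sketch of combining that detector with the recognition model above. The weight file name, input size and mean values follow the OpenCV tutorial, and the frame path is a placeholder, so treat them as assumptions to verify on your side:

import cv2
import numpy as np

# text detection with the DB (ResNet-18) model
detector = cv2.dnn_TextDetectionModel_DB("DB_TD500_resnet18.onnx")  # download via the tutorial link
detector.setBinaryThreshold(0.3)
detector.setPolygonThreshold(0.5)
detector.setInputParams(scale=1.0/255,
                        size=(736, 736),
                        mean=(122.67891434, 116.66876762, 104.00698793))

frame = cv2.imread("frame.jpg")  # a full video frame (hypothetical path)
boxes, confidences = detector.detect(frame)

# crop every detected quadrilateral and hand it to the recognition model from above
for quad in boxes:
    x, y, w, h = cv2.boundingRect(np.array(quad, dtype=np.int32))
    roi = frame[y:y + h, x:x + w]
    print(dnn_model.recognize(roi))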