I've run OCR on a document and have a list of results, one per word: the x0, y0, x1 and y1 coordinates of the word's bounding box, plus the word itself.
The OCR library returns the items in "reading order", from top left to bottom right, based on "what the creator of the document intended" - and sometimes this is not the natural reading order.
For example, text that belongs in the same block is sometimes recognized as separate blocks, which breaks the order. In the image below, the list of events should alternate years and events, in the order ["1999", "Item 1", "2000", "Item 2", ...]. Instead, the OCR treats the two columns as separate text boxes, ["1999", "2000", "2000", "2001", ..., "Item 1", "Item 2", "Item 3", ...], and the logical order of the document is lost.
The library has a sorting algorithm, but unfortunately it does not handle skewed documents well: it assumes that a bbox higher up on the page must always come earlier in the order. So if the document is skewed counter-clockwise, the last word in a line ends up higher on the page than the first word and jumps earlier in the order.
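To make the failure mode concrete, here is a minimal sketch with made-up coordinates: two words on the same visual line of a page skewed roughly two degrees counter-clockwise, sorted naively by (y0, x0).

    # Two words on the same visual line; because the page is skewed
    # counter-clockwise, the rightmost word's top edge sits ~12pt higher.
    words = [
        (50, 100.0, 90, 112.0, 'first'),   # (x0, y0, x1, y1, text)
        (400, 87.8, 440, 99.8, 'last'),
    ]
    print(sorted(words, key=lambda w: (w[1], w[0])))
    # 'last' sorts before 'first': pure top-down ordering breaks reading order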
I'm trying to figure out a way to order these bounding boxes by the true natural reading order. What I've tried so far: calculate the midpoint of the document and of each bounding box, compute the distance from each word to the document midpoint, and use that as a distance metric for sorting - but not much luck yet, and it feels like I may be overcomplicating things. Another approach I've considered is deducing "lines" of text from the bboxes, with some threshold to account for skew; once each bbox is assigned to a line, the boxes on a line can simply be sorted by their x-axis location from left to right. This is not implemented yet, just a thought at this point.
The way the code is written I do not have the option of deskewing the document before parsing it, so I need to infer the order from the bounding boxes alone.
Here's what I have so far.
from math import sqrt

import fitz  # PyMuPDF
from tqdm import tqdm


def read_pdf(source_path: str, **kw) -> dict:
    '''Extract per-page word tuples with PyMuPDF, keyed by page number (from 1).'''
    pages = fitz.open(source_path)
    data = {}
    for page_number, page in enumerate(tqdm(pages), start=1):
        # Each word tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no)
        text_blocks = page.get_text('words')
        data[page_number] = text_blocks
    return data
def calculate_distances_to_order_bboxes(document: list) -> dict:
    '''For each word bbox, compute a weighted distance to the document midpoint.'''
    min_x = min(min(bbox[0], bbox[2]) for bbox in document)
    min_y = min(min(bbox[1], bbox[3]) for bbox in document)
    max_x = max(max(bbox[0], bbox[2]) for bbox in document)
    max_y = max(max(bbox[1], bbox[3]) for bbox in document)
    document_mid_point_x, document_mid_point_y = (min_x + max_x) / 2, (min_y + max_y) / 2
    calculations_dict = {}
    for enum_id, (x0, y0, x1, y1, word, block_no, line_no, word_no) in enumerate(document):
        bbox_mid_x, bbox_mid_y = (x0 + x1) / 2, (y0 + y1) / 2
        dist_to_mid_x, dist_to_mid_y = document_mid_point_x - bbox_mid_x, document_mid_point_y - bbox_mid_y
        # Weight words right of / below the midpoint so they sort later.
        # Reset per word: in my first draft the multiplier accumulated
        # across the whole loop, which was a bug.
        multiplier = 1
        if dist_to_mid_x < 0:
            multiplier += 0.2
        if dist_to_mid_y < 0:
            multiplier += 1
        dist_to_mid = sqrt(dist_to_mid_x**2 + dist_to_mid_y**2) * multiplier
        calculations_dict[enum_id] = {
            'word': word, 'x0': x0, 'y0': y0, 'x1': x1, 'y1': y1,
            'bbox_mid_x': bbox_mid_x, 'bbox_mid_y': bbox_mid_y,
            'dist_to_mid_x': dist_to_mid_x, 'dist_to_mid_y': dist_to_mid_y,
            'dist_to_mid': dist_to_mid,
        }
    return calculations_dict
data = read_pdf(source_path_to_pdf)
document = data[1]  # pages are keyed from 1 (enumerate start=1), so data[0] would raise a KeyError
calculations_dict = calculate_distances_to_order_bboxes(document)
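For completeness, this is how I'd plug the distances into a sort. It's only a sketch of the idea, since the metric itself doesn't reproduce reading order reliably (hence "not much luck yet"):

    # Hypothetical: order words by their weighted distance to the midpoint.
    # The multiplier pushes right/bottom words later, but words equidistant
    # from the midpoint still collide, which is where this approach stalls.
    ordered = sorted(calculations_dict.values(), key=lambda d: d['dist_to_mid'])
    print(' '.join(d['word'] for d in ordered))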
So I figured out a simpler approach, using pandas, which I'm more familiar with. The function below takes the raw data (`data` in the example above, with all of the pages at once), turns it into a pandas DataFrame, and then sorts by page number, y-location and finally x-location, using a threshold derived from each bbox's height to decide whether another bbox is likely on the same line. For a document of 20 pages the sorting takes 0.05 seconds.
import pandas as pd


def calculate_bbox_distances(df: pd.DataFrame, first_pass: bool = False) -> pd.DataFrame:
    '''
    Sorts the dataframe and calculates y-distances between consecutive rows.
    On the first pass, finds rows that are likely the last word before a line
    break (distance to the next row's `y0` is above `bbox_threshold`) and writes
    their `y0` into a new column `new_y0`, leaving all other rows empty for this
    column. Then takes advantage of `bfill()` to fill the `y0` values backwards,
    so that all words on the same visual line end up sharing one `y0`.
    '''
    df = df.sort_values(['page_num', 'y0', 'x0'])
    df['prev_y0'], df['next_y0'] = df.y0.shift(1), df.y0.shift(-1)
    df['prev_page_num'], df['next_page_num'] = df.page_num.shift(1), df.page_num.shift(-1)
    # The first word of a page has no predecessor on the same page.
    df.loc[df.page_num != df.prev_page_num, 'prev_y0'] = df.loc[df.page_num != df.prev_page_num, 'y0']
    df['dist_to_prev_y0'], df['dist_to_next_y0'] = abs(df.prev_y0 - df.y0), abs(df.next_y0 - df.y0)
    if first_pass:
        row_before_new_line = df.dist_to_next_y0 > df.bbox_threshold
        df.loc[row_before_new_line, 'new_y0'] = df.y0
        # The rows after the last marked row have nothing to backfill from
        # (their dist_to_next_y0 is NaN), so keep their own y0 instead of NaN.
        df['y0'] = df.new_y0.bfill().fillna(df.y0)
    return df
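To see what the first pass does, here is a toy example with made-up coordinates: three words on one slightly skewed line, followed by one word on the next line.

    # Toy data: heights are 12, so bbox_threshold = 0.4 * 12 = 4.8.
    toy = pd.DataFrame({
        'page_num': [1, 1, 1, 1],
        'word':     ['first', 'middle', 'last', 'next'],
        'x0':       [50, 150, 250, 50],
        'x1':       [90, 190, 290, 90],
        'y0':       [100.0, 99.0, 98.0, 120.0],
        'y1':       [112.0, 111.0, 110.0, 132.0],
    })
    toy['bbox_height'] = toy.y1 - toy.y0
    toy['bbox_threshold'] = toy.bbox_height * 0.4
    out = calculate_bbox_distances(toy, first_pass=True)
    # After sorting by y0, the skewed line reads last(98), middle(99),
    # first(100); the gap from 'first' to 'next' (20) exceeds the threshold
    # (4.8), so 'first' is marked and its y0 (100.0) is backfilled onto the
    # whole line. All three words now share y0=100.0 and will sort
    # left-to-right on the second pass.
    print(out[['word', 'y0']])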
def get_ordered_words_dict(data: dict) -> dict:
    '''
    `data` is a dictionary where the key is the page number and the value is
    what PyMuPDF returns from page.get_text('words').
    '''
    dict_list = []
    for page_num, document in data.items():
        for word_id, (x0, y0, x1, y1, word, block_no, line_no, word_no) in enumerate(document):
            bbox_dict = {'page_num': page_num, 'word_id': word_id, 'word': word,
                         'x0': x0, 'y0': y0, 'x1': x1, 'y1': y1}
            dict_list.append(bbox_dict)
    df = pd.DataFrame(dict_list)
    df['bbox_height'] = df.y1 - df.y0
    # Words whose y0 differs by less than 40% of their height are treated
    # as sitting on the same line.
    df['bbox_threshold'] = df.bbox_height * 0.4
    df = calculate_bbox_distances(df, True)  # snap skewed words onto shared lines
    df = calculate_bbox_distances(df)        # re-sort using the snapped y0 values
    # Zero out small gaps so they become spaces rather than line breaks
    # (plain .loc assignment avoids pandas' chained-assignment warning).
    df.loc[df.dist_to_next_y0 <= df.bbox_threshold, 'dist_to_next_y0'] = 0
    df['word_with_splitter'] = df.word + df.dist_to_next_y0.apply(lambda x: ' ' if x == 0 else '\n')
    words_dict = df.groupby('page_num')['word_with_splitter'].sum().to_dict()
    return words_dict
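And a short usage example tying the pieces together, reusing `source_path_to_pdf` from above:

    data = read_pdf(source_path_to_pdf)
    words_dict = get_ordered_words_dict(data)
    print(words_dict[1])  # page 1 reconstructed in inferred reading order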