I am iterating over a bunch of PDFs in a folder, parsing their content and appending it to a list. It works on a subset of the PDF files. I don't want to manually remove some of the PDFs, run the code, and then add a few back and run it again until I have found the malfunctioning PDFs. Since some PDFs cannot be opened or may have corrupted content, I did the following to ensure the loop runs through: `check_extractable`
(pdfminer should throw an error if a PDF is not extractable) is a method of a built-in class (`PDFTextExtractionNotAllowed`) that can prevent it from trying to open a PDF it actually cannot open.
Question: What do I need to do to make the code keep running even if there is a PDF that cannot be opened or has no content (assuming this is why the error is thrown at that specific point in the code)?
import pdfminer
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
import os
import io
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTFigure, LTImage,
LTTextLine, LTTextContainer, LTChar, LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
# Script: for every PDF found in `directory`, run pdfminer's layout analysis
# and collect the text blocks whose vertical position lies inside a fixed
# window, plus the first of those blocks.
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so the nesting described in the comments below is inferred from the syntax.
directory = 'C:/Users/'
# Receives one [text_pos, wap] pair per append() call further down.
data = []
for file in os.listdir(directory):
# Only files whose name ends in ".pdf" are processed.
if not file.endswith(".pdf"):
continue
# NOTE(review): nothing in this snippet writes to this buffer, so the
# getvalue() call below appears to always yield an empty string.
fake_file_handle = io.StringIO()
with open(os.path.join(directory, file), 'rb') as fh:
resource_manager = PDFResourceManager()
laparams = LAParams(line_margin = 0.6)
# The aggregator keeps the analysed layout of the most recently processed
# page; it is read back through device.get_result() below.
device = PDFPageAggregator(resource_manager, laparams = laparams)
page_interpreter = PDFPageInterpreter(resource_manager, device)
# Bounding boxes of the characters that pass the matrix test below.
positions = []
# [coord, word] pairs, one per text object encountered.
raw_text = []
for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
page_interpreter.process_page(page)
text = fake_file_handle.getvalue()
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextContainer) or isinstance(lobj, LTTextBox) or isinstance(lobj, pdfminer.layout.LTTextBoxHorizontal):
# bbox[1] is presumably y0 (it is compared against the y0 limits
# below) - TODO confirm; it is truncated to an int.
coord, word = int(lobj.bbox[1]), lobj.get_text().strip()
raw_text.append([coord, word])
# NOTE(review): the nested loops assume every child of lobj is itself
# iterable. The reported traceback shows an LTChar reaching the inner
# `for`, i.e. the children of lobj can be characters directly.
for text_line in lobj:
for character in text_line:
if isinstance(character, LTChar):
# Keep only characters whose first matrix component is positive.
if character.matrix[0]>0 :
position = character.bbox
positions.append(position)
# if it's a container, recurse
# NOTE(review): no recursion is implemented - figures are skipped.
elif isinstance(lobj, LTFigure):
pass
# extract elements below y0=781 and above y0=57
text_pos = []
maxFontpos = 780
minFontpos = 58
for coord, word in raw_text:
if coord <= maxFontpos and coord >= minFontpos:
text_pos.append(word)
else:
pass
# First retained block. If text_pos is empty the IndexError is swallowed and
# `wap` is left as it was (unbound, or the value from an earlier file).
try:
wap = text_pos[0]
except:
pass
data.append([text_pos, wap])
fake_file_handle.close()
The specific error is thrown at
---> 28 for character in text_line:
29 if isinstance(character, LTChar):
30 if character.matrix[0]>0 :
TypeError: 'LTChar' object is not iterable
If this is just a quick and dirty script, I would recommend just surrounding the entire `with` block in a general `try`/`except`. Typically you don't want to blindly catch exceptions without specifying which type you are looking for, in case a different exception/error occurs that you were not expecting, but in a situation like this I think it would be okay:
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
# Walk every PDF in `directory`, run pdfminer's layout analysis on each page
# and collect the text blocks found inside a fixed vertical window.  A PDF
# that cannot be processed is reported by name and skipped, so a single bad
# file neither stops the run nor goes unnoticed.
directory = 'C:/Users/'
data = []  # one [text_pos, wap] entry per successfully processed PDF

# A text block is kept when int(bbox[1]) lies within [minFontpos, maxFontpos].
# Both limits are loop-invariant, so they are defined once up front.
maxFontpos = 780
minFontpos = 58

for file in os.listdir(directory):
    if not file.endswith(".pdf"):
        continue
    try:
        with open(os.path.join(directory, file), 'rb') as fh:
            resource_manager = PDFResourceManager()
            laparams = LAParams(line_margin=0.6)
            device = PDFPageAggregator(resource_manager, laparams=laparams)
            page_interpreter = PDFPageInterpreter(resource_manager, device)
            positions = []  # bounding boxes of the accepted characters
            raw_text = []   # [coord, word] pairs, one per text object
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)
                layout = device.get_result()
                for lobj in layout:
                    # LTTextBox and LTTextBoxHorizontal are subclasses of
                    # LTTextContainer, so this single check covers them all.
                    if not isinstance(lobj, LTTextContainer):
                        # Figures and other non-text objects are skipped.
                        continue
                    coord, word = int(lobj.bbox[1]), lobj.get_text().strip()
                    raw_text.append([coord, word])
                    for text_line in lobj:
                        # A text box yields text lines (themselves containers
                        # of characters), but a bare text line yields its
                        # characters directly.  Iterating such a character is
                        # what raised "'LTChar' object is not iterable".
                        if isinstance(text_line, LTTextContainer):
                            characters = text_line
                        else:
                            characters = (text_line,)
                        for character in characters:
                            if isinstance(character, LTChar) and character.matrix[0] > 0:
                                positions.append(character.bbox)  # font position
    except Exception as exc:
        # Deliberately broad: any failure while opening or parsing this PDF
        # should only skip this file.  Unlike a bare `except:`, this lets
        # KeyboardInterrupt/SystemExit through, and the offending file is
        # named so it can be inspected afterwards.
        print("Skipping {!r}: {}: {}".format(file, type(exc).__name__, exc))
        continue  # Move on to next loop iteration

    # extract elements below y0=781 and above y0=57
    text_pos = [word for coord, word in raw_text
                if minFontpos <= coord <= maxFontpos]
    # First retained block, or None when nothing fell inside the window
    # (previously `wap` was left unbound or carried over from an earlier file).
    wap = text_pos[0] if text_pos else None
    data.append([text_pos, wap])