python pdf ocr pdf-extraction

PDF to text in Python returning empty results in image files


I have a scanned PDF file: an image-based, low-resolution PDF. I'm trying to extract the data in it, but none of the options I've tried seem to work.

Option 1 - using pdfminer

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    """Extract the embedded text layer of a PDF with pdfminer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text pdfminer found on all pages, concatenated into one string.
        For a scanned, image-only PDF this contains little or no text,
        because there is no text layer to read.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0  # 0 means "no page limit"
    caching = True
    pagenos = set()  # empty set means "all pages"

    # The with-block guarantees the file and the buffer are closed even if
    # pdfminer raises while parsing a damaged document.
    with open(path, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the with-block.
            text = retstr.getvalue()
        finally:
            device.close()

    return text

Option 2 - using tika

# Let Apache Tika parse the file; `path` must already hold the PDF's location.
from tika import parser # pip install tika
raw = parser.from_file(path)
# NOTE(review): presumably 'content' is None (not "") when Tika finds no text - confirm.
text=raw['content']
# I don't like to use it very much because it often corrupts the file

Option 3 - using PyPDF2

    import PyPDF2
    # Keep the file open only while PyPDF2 reads from it; the with-block
    # closes the handle afterwards, even if parsing fails.
    with open(path, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        number_of_pages = read_pdf.getNumPages()
        # Only the first page (index 0) is extracted here.
        page = read_pdf.getPage(0)
        page_content = page.extractText()
    # Note: encode() turns the extracted str into UTF-8 bytes.
    text=page_content.encode('utf-8')

All of these options return empty results. I suppose it might be related to the quality of the file. I know we could work on images and improve their characteristics to ease the data extraction (increase the image size, adjust thresholds, etc. - you can do a lot of things with PIL). Is there an efficient way to do the same with PDF files?


Solution

  • In the end I came up with a solution, which is not ideal but worked for me, using pdfminer, pdf2image and pytesseract:

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO
    
    def convert_pdf_image_to_text(file_path, dpi=350):
        """OCR every page of a PDF and return the recognised text.

        Each page is rendered to an image with pdf2image and then passed to
        pytesseract; the per-page results are concatenated in page order.

        Args:
            file_path: Filesystem path of the PDF to read.
            dpi: Resolution (dots per inch) used when rendering the pages.
                Defaults to 350, the value this function used before the
                resolution became configurable.

        Returns:
            The OCR text of all pages as one string ("" if there are no pages).
        """
        # Imported lazily so the OCR dependencies are only needed when this
        # fallback is actually used.
        from pdf2image import convert_from_path
        import pytesseract

        pages = convert_from_path(file_path, dpi)
        # join() avoids re-copying the growing string once per page.
        return "".join(pytesseract.image_to_string(page) for page in pages)
    
    def convert_pdf_to_txt(path, extract_image_pdfs=True):
        """Extract text from a PDF, falling back to OCR for image-only files.

        The embedded text layer is read with pdfminer first. If that text
        contains no letter "a" (in either case) the PDF is assumed to be a
        scanned image, and the pages are run through OCR instead.

        Args:
            path: Filesystem path of the PDF to read.
            extract_image_pdfs: When True (the default), use the OCR fallback
                for PDFs whose text layer looks empty. When False, always
                return whatever pdfminer extracted.

        Returns:
            The extracted text, or the literal string "no text" if the OCR
            fallback was attempted and failed.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        password = ""
        maxpages = 0  # 0 means "no page limit"
        caching = True
        pagenos = set()  # empty set means "all pages"

        # The with-block guarantees the file and the buffer are closed even if
        # pdfminer raises while parsing a damaged document.
        with open(path, 'rb') as fp, StringIO() as retstr:
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
                # Read the buffer before it is closed on leaving the with-block.
                text = retstr.getvalue()
            finally:
                device.close()

        # extracting data from image pdfs
        # Heuristic: these PDFs always contain an "a", so a text layer without
        # one (upper or lower case) means there was nothing real to extract.
        if extract_image_pdfs and "a" not in text.lower():
            try:
                print('starting to convert to image')
                text = convert_pdf_image_to_text(path)
                print('finished converting to image')
            except Exception:
                # Best effort: OCR can fail on files that are neither a
                # readable PDF nor an image; report that instead of crashing.
                text = "no text"
                print("not pdf nor image")

        return text