I'm on macOS, using Python 3.10.
I have this code, which I took from another post and changed slightly:
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
# Build the pdfminer pipeline: a resource manager and layout parameters,
# an aggregator device that uses both, and an interpreter that is given the
# resource manager and the device.
rsrcmgr, laparams = PDFResourceManager(), LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Open the PDF with a context manager so the file handle is closed even when
# process_page() raises. The pages are consumed inside the `with` block
# because they are read from the open file object.
with open("my_pdf", 'rb') as fp:
    pages = PDFPage.get_pages(fp)
    for page in pages:
        interpreter.process_page(page)
        # Layout produced by the aggregator device for the page just processed.
        layout = device.get_result()
print("It worked")
However, when I run it on some PDFs, it fails with this error:
Traceback (most recent call last):
File "MY_DIRECTORY/create_database.py", line 38, in <module>
interpreter.process_page(page)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 991, in process_page
self.render_contents(page.resources, page.contents, ctm=ctm)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 1010, in render_contents
self.execute(list_value(streams))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 1036, in execute
func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 966, in do_Do
interpreter.render_contents(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 1010, in render_contents
self.execute(list_value(streams))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 1036, in execute
func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 903, in do_Tj
self.do_TJ([s])
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfinterp.py", line 896, in do_TJ
self.device.render_string(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfdevice.py", line 133, in render_string
textstate.linematrix = self.render_string_horizontal(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdfdevice.py", line 170, in render_string_horizontal
for cid in font.decode(obj):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/pdffont.py", line 1174, in decode
return self.cmap.decode(bytes)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pdfminer/cmapdb.py", line 136, in decode
return struct.unpack(">%dH" % n, code)
struct.error: unpack requires a buffer of 6 bytes
Is it a problem with my code, the library pdfminer.six, or a problem with some pdfs? And how can I fix it?
I solved it. For some reason, this part of the code:
# Pipeline objects: a resource manager and layout parameters, an aggregator
# device built from both, and an interpreter given the manager and the device.
rsrcmgr, laparams = PDFResourceManager(), LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
has to be in between these:
# Open the PDF in binary mode and obtain its pages from the file object.
fp = open("my_pdf", 'rb')
pages = PDFPage.get_pages(fp)
So the final code looks like this:
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
# Open the PDF with a context manager so the file handle is closed even when
# process_page() raises.
with open("my_pdf", 'rb') as fp:
    # The resource manager, device and interpreter are created after the file
    # is opened, i.e. once for this file.
    # NOTE(review): PDFResourceManager caches fonts by default; if the real
    # script handles several PDFs, building a fresh manager per file avoids
    # reusing cached fonts from a previous document -- presumably why this
    # ordering fixed the struct.error. Confirm against the pdfminer.six docs.
    rsrcmgr, laparams = PDFResourceManager(), LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)
    for page in pages:
        interpreter.process_page(page)
        # Layout produced by the aggregator device for the page just processed.
        layout = device.get_result()
print("It worked")
If anyone knows why this reordering fixes the error, please answer this post; I'd be glad to learn.