I have written python code that scrapes all the data from the PDF file. The problem here is that once it is scraped,the words lose their grammer. How to fix these problem? I am attaching the code.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec,laparams=laparams)
with open(path, 'rb') as fp:
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
caching = True
pagenos = set()
for page in PDFPage.get_pages(fp, pagenos, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
device.close()
retstr.close()
return text
print convert_pdf_to_txt("S24A276P001.pdf")
Best way to solve the problem is use textract
module from python and load hindi test data from its github repository and write the extracted text to a txt file. This solved my problem.