Here is the program that reads the files in a folder by name and extracts their data. Now I want to compare the extracted data with the keywords used in the program below, but it gives me:
pdfReader = pdfFileObj.loadPage(0)
AttributeError: '_io.BufferedReader' object has no attribute 'loadPage'
I want to remove the error and compare the keywords with the extracted data. I used the PyMuPDF library for this program.
import fitz
import os
# Script: list the PDF files in 'resume/', print each file's extracted text,
# then count how many search keywords occur in each comma-separated chunk.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the statements below is ambiguous; the comments describe each line as
# written.
pdfFiles = []
for filename in os.listdir('resume/'):
if filename.endswith('.pdf'):
print(filename)
# pdfFiles.append(filename)
# Change into the resume folder so the bare file name can be opened below.
os.chdir('C:/Users/M. Abrar Hussain/Desktop/cv/resume')
print('Current working dir : %s' % os.getcwd())
pdfFileObj = open(filename, 'rb')
# This is the line that raises the reported AttributeError: open() returns an
# _io.BufferedReader, which has no loadPage() method.
# NOTE(review): presumably loadPage() was meant to be called on the document
# returned by fitz.open() -- confirm.
pdfReader = pdfFileObj.loadPage(0)
# NOTE(review): an open file object is passed here; verify that fitz.open()
# accepts it (a path such as `filename` is the usual argument).
with fitz.open(pdfFileObj) as doc:
# Concatenate the text of every page of the document.
text = ""
for page in doc:
text += page.getText()
print(text)
# split the docs
# Only page index 0 is requested here, regardless of how many pages exist.
pageObj = pdfReader.getpage(0)
t1 = (pageObj.getText())
# Each comma-separated chunk is treated as one "sentence".
t1 = t1.split(",")
search_keywords = ['python', 'Laravel', 'Java']
for sentence in t1:
lst = []
for word in search_keywords:
# NOTE(review): `word` is taken from search_keywords, so this test is always
# true; it probably should test against `sentence` -- confirm intent.
if word in search_keywords:
# NOTE(review): `list` is the built-in type, not the `lst` created above.
list.append(word)
print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))
pdfFileObj.close()
You missed two lines: `import PyPDF2`
and `pdfReader = PyPDF2.PdfFileReader(pdfFileObj)`.
Note that `getPage(0)` returns the page object for page number 0, so your loop keeps reading the same page on every iteration. If you want to read a new page on each iteration, check how many pages the document has and use an index `i` that runs from 0 up to (but not including) `pdfReader.numPages`.
import fitz
import os
import PyPDF2
# Script: for every PDF in the resume folder, print its full text (PyMuPDF)
# and then report, page by page (PyPDF2), which search keywords occur in each
# comma-separated chunk of text.
pdfFiles = []
# Keywords to look for; defined once because the list never changes.
search_keywords = ['python', 'Laravel', 'Java']

for filename in os.listdir('resume/'):
    if not filename.endswith('.pdf'):
        continue
    print(filename)
    # Change into the resume folder so the bare file name returned by
    # os.listdir() can be opened directly. The path is absolute, so repeating
    # the call on later iterations is harmless.
    os.chdir('C:/Users/M. Abrar Hussain/Desktop/cv/resume')
    print('Current working dir : %s' % os.getcwd())

    # PyMuPDF opens a document from its path, not from an open file object.
    with fitz.open(filename) as doc:
        text = "".join(page.getText() for page in doc)
    print(text)

    # The context manager closes the file even if PDF parsing raises.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Visit every page instead of re-reading page 0 each time.
        for page_number in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(page_number)
            # PyPDF2 page objects expose extractText(); getText() belongs to
            # PyMuPDF pages.
            t1 = pageObj.extractText().split(",")
            for sentence in t1:
                # Compare case-insensitively so 'python' also matches 'Python'.
                lowered = sentence.lower()
                lst = [word for word in search_keywords
                       if word.lower() in lowered]
                print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))