pythonpdfpypdfpdfminertika-python

How to extract text from multiple pdf in a location with specific line and store in Excel?


I have 100 pdf stored in a location and I want to extract text from them and store in excel below is pdf image in this i want (stored in page1)

bid no,end date,item category,organisation name 

enter image description here enter image description here

needed

OEM Average Turnover (Last 3 Years),Years of Past Experience required,MSE Exemption for Years Of Experience
and Turnover,Startup Exemption for Years of Experience
and Turnover,Estimated Bid Value,EMD Required

enter image description here

Consignee address only) enter image description here


Solution

  • Tika is one of the Python packages that you can use to extract the data from your PDF files.

    In the example below I'm using Tika and regular expressions to extract these five data elements:

    import re as regex
    from tika import parser
    
    parse_entire_pdf = parser.from_file('2022251527199.pdf', xmlContent=True)
    for key, values in parse_entire_pdf.items():
        if key == 'content':
            bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
            print(bid_number.group(2))
            GEM/2022/B/1916455
    
            bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
            print(bid_end_date.group(2))
            21-02-2022 15:00:00
    
            org_name = regex.search(r'(Organisation Name)\W(.*)', values)
            print(org_name.group(2))
            State Election Commission (sec), Gujarat
    
            item_category = regex.search(r'(Item Category)\W(.*)', values)
            print(item_category.group(2))
            Desktop Computers (Q2) , Computer Printers (Q2)
     
            total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
            print(total_quantity.group(2))
            18
    
    

    Here is one way to write out the extracted data to a CSV file:

    import csv
    import re as regex
    from tika import parser
    
    document_elements = []
    
    # processing 2 documents 
    documents = ['202225114747453.pdf', '2022251527199.pdf']
    for doc in documents:
        parse_entire_pdf = parser.from_file(doc, xmlContent=True)
        for key, values in parse_entire_pdf.items():
            if key == 'content':
                bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
    
                bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
    
                org_name = regex.search(r'(Organisation Name)\W(.*)', values)
    
                item_category = regex.search(r'(Item Category)\W(.*)', values)
    
                total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
                
                document_elements.append([bid_number.group(2),
                                          bid_end_date.group(2), 
                                          org_name.group(2), 
                                          item_category.group(2),
                                          total_quantity.group(2)])
    
    
    with open("out.csv", "w", newline="") as f:
        headerList = ['bid_number', 'bid_end_date', 'org_name', 'item_category', 'total_quantity']
        writer = csv.writer(f)
        writer.writerow(headerList)
        writer.writerows(document_elements)
    

    enter image description here

    Here is the additional code that you asked for in the comments.

    import os
    import re as regex
    from tika import parser
    
    document_elements = []
    
    image_directory = "pdf_files"
    image_directory_abspath = os.path.abspath(image_directory)
    for dirpath, dirnames, filenames in os.walk(image_directory_abspath):
        for filename in [f for f in filenames if f.endswith(".pdf")]:
            parse_entire_pdf = parser.from_file(os.path.join(dirpath, filename), xmlContent=True)
            for key, values in parse_entire_pdf.items():
                if key == 'content':
                    bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
    
                    bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
    
                    org_name = regex.search(r'(Organisation Name)\W(.*)', values)
    
                    item_category = regex.search(r'(Item Category)\W(.*)', values)
    
                    total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
    
                    document_elements.append([bid_number.group(2),
                                              bid_end_date.group(2),
                                              org_name.group(2),
                                              item_category.group(2),
                                              total_quantity.group(2)])
    
    with open("out.csv", "w", newline="") as f:
        headerList = ['bid_number', 'bid_end_date', 'org_name', 'item_category', 'total_quantity']
        writer = csv.writer(f)
        writer.writerow(headerList)
        writer.writerows(document_elements)
    
    

    SPECIAL NOTE: I noted that some PDFs don't have an org_name, so you will have to figure out how to handle these with either a N/A, None, or Null