Tags: python, pdf, arabic, pypdf, text-extraction

Extracting Arabic data from PDF using PyPDF2


I wanted to write, in Python 3, a function that extracts data from an Arabic PDF file (235 pages, 13.6 MB), focusing on pages 51 to 67 inclusive, then filters the extracted data based on specific inputs and exports the final filtered data to Excel.

The issue now is that the Excel file is empty, and when I investigated I found that the 'df' did not print correctly in its Arabic form, but in a reversed, rather messy shape.

Here is the code I have written so far...

from PyPDF2 import PdfReader
import pandas as pd
from pandas import ExcelWriter
from io import StringIO
from pdfminer.high_level import extract_text
from pathlib import Path
import fitz
import arabic_reshaper
from bidi.algorithm import get_display

def reshape_text(text):
    """Convert logical-order Arabic text into its visual display form.

    arabic_reshaper joins the letters into their contextual glyph shapes,
    and the bidi algorithm then reorders the result for left-to-right
    display environments.
    """
    return get_display(arabic_reshaper.reshape(text))

def flat_filter():
    """Extract text from pages 51-67 of an Arabic PDF, filter the rows by
    user-supplied per-column conditions, and export the result to FLAT.xlsx.

    Raises:
        FileNotFoundError: if the source PDF does not exist.
    """
    print("Starting PDF processing...")
    pdf_path = Path(r"C:\Users\Documents\Personal\Python\pd\char.pdf")

    if not pdf_path.exists():
        raise FileNotFoundError(f"Error: PDF file not found at path: {pdf_path}")

    start_page = 51
    end_page = 67

    # pdfminer's extract_text accepts the path directly, so there is no
    # need to open (and leak) a separate file handle.
    skip_words = ["a", "130", "الأرضي"]
    chunks = []  # one DataFrame per page
    for page_num in range(start_page, end_page + 1):
        text = extract_text(pdf_path, page_numbers=[page_num], maxpages=1, password="")
        filtered_lines = [
            line for line in text.splitlines()
            if not any(word.lower() in line.lower() for word in skip_words)
        ]
        chunks.append(pd.DataFrame([line.split() for line in filtered_lines]))
    df = pd.concat(chunks, ignore_index=True)

    columns_to_filter = [2, 4, 5, 7]  # 0-based column positions
    filter_conditions = []  # one condition per column, in order

    while True:
        col = columns_to_filter[len(filter_conditions)]
        condition = input("Enter a filter condition for column {}:..".format(col + 1))
        filter_conditions.append(condition)
        # Stop once every target column has a condition; this prevents the
        # IndexError the original hit when more conditions than columns
        # were entered.
        if len(filter_conditions) == len(columns_to_filter):
            break
        choice = input("Do you want to add another filter condition? (yes/no): ")
        if choice.lower() == 'no':
            break

    # Apply each condition to its own column as a literal substring match.
    # Previously all conditions were joined with " & " into one string and
    # searched in every column at once, which never matched anything and
    # produced the empty Excel file.
    for col_index, condition in zip(columns_to_filter, filter_conditions):
        mask = df.iloc[:, col_index].fillna('').str.contains(condition, regex=False)
        df = df[mask]

    print("Exporting DataFrame to Excel...")
    # NOTE(review): get_display() emits visually-reordered text for dumb
    # terminals; Excel applies its own bidi layout, so reshaping here may
    # itself cause the "reversed, messy" appearance — confirm whether the
    # raw text renders correctly without this step.
    df = df.map(reshape_text)
    # Context manager guarantees the workbook is flushed and closed even
    # if to_excel raises.
    with pd.ExcelWriter("FLAT.xlsx", engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name="Data", index=True)
    print("Successfully exported filtered data to FLAT.xlsx!")

flat_filter()

Image of the data I need to extract:

pic of the data I need to extract


Solution

  • This should get you started. It will extract the tables found on the specified pages and combine them into a single DataFrame.

    Following this you can sort/filter the data as you require.

    There are some issues with the headers in terms of Arabic text, so you might need to handle these separately. You could use the df.rename() method to update these. My assumption is that the headers are the same on all extracted pages.

    import pandas as pd
    import fitz
    
    pdf_file = r'arabic_doc.pdf'
    
    # Collect one extracted table (a list of row lists) per page.
    list_of_list = []
    doc = fitz.open(pdf_file)
    # NOTE(review): Document.pages(start, stop) is stop-exclusive like
    # range(), and page numbers are 0-based — confirm this span matches the
    # question's "pages 51 to 67 inclusive".
    for page in doc.pages(51, 67):
        tabs = page.find_tables()
        if tabs.tables:
            # Only the first table detected on each page is kept.
            list_of_list.append(tabs[0].extract())
           
    
    # Stack the per-page tables. Indices are deliberately NOT reset, so the
    # header row of every page keeps the label 0.
    df = pd.concat([pd.DataFrame(d) for d in list_of_list])
    # Promote the first row to column headers, then drop every row labelled
    # 0 — i.e. the repeated header row from each page.
    df.columns = df.iloc[0]
    df = df.drop([0], axis=0)
    # Turn the surviving per-page row labels into a regular column.
    df = df.rename_axis('page_index').reset_index()
    df.head()
    
    

    Edit - I am not able to copy the text to Notepad which suggests there are font issues as suggested by Andj. So you might be best just dropping them and adding new headers. enter image description here