The script below extracts a PDF table using pdfplumber. The table always appears after the second occurrence of the expression "Quadro de Definições" and spans 12 pages. The script works, but some fields come out with missing content, for example "Custodiante", "Fundo", "Gestora", and "Escriturador".
Here is the original PDF (it is in Brazilian Portuguese), and here is the CSV I generated.
Please, does anyone know what strategy I could use to capture all the information?
import logging
import pdfplumber
import pandas as pd
import os
import re

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None

            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))
            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")
            return tables
    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def safe_strip(cell):
    """
    Safely strip whitespace from a cell, handling None values.

    Args:
        cell: The cell content to strip.

    Returns:
        str: The stripped string or an empty string if cell is None.
    """
    if cell is None:
        return ''
    return str(cell).strip()

def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.

    Args:
        tables (list): List of tables extracted from the PDF.

    Returns:
        pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []
    for table_index, table in enumerate(tables):
        # Remove empty rows
        table = [row for row in table if row and any(safe_strip(cell) != '' for cell in row)]

        # Process each row
        processed_rows = []
        for row in table:
            if len(row) == 1:
                # If the row has only one column, split it into two at the first double space
                content = safe_strip(row[0])
                split_row = re.split(r'\s{2,}', content, maxsplit=1)
                processed_rows.append(split_row if len(split_row) == 2 else [split_row[0], ''])
            else:
                # If the row has multiple columns, take the first two
                processed_rows.append([safe_strip(cell) for cell in row[:2]])
        processed_tables.extend(processed_rows)

    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])

    # Remove rows where the Term is empty
    df = df[df['Term'] != '']

    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset='Term', keep='first')

    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    return df

if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")

    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)

    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)

            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")

    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")
The solution is to improve the table extraction by tuning its settings: with the defaults, pdfplumber appears to miss some of this PDF's cell boundaries, which is why several fields come back empty.
Here is a revised version of your script:
import logging
import pdfplumber
import pandas as pd
import os

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

# Default settings for table extraction, from: https://pypi.org/project/pdfplumber/#Table-extraction%20settings
# settings = {
#     "vertical_strategy": "lines",
#     "horizontal_strategy": "lines",
#     "explicit_vertical_lines": [],
#     "explicit_horizontal_lines": [],
#     "snap_tolerance": 3,
#     "snap_x_tolerance": 3,
#     "snap_y_tolerance": 3,
#     "join_tolerance": 3,
#     "join_x_tolerance": 3,
#     "join_y_tolerance": 3,
#     "edge_min_length": 3,
#     "min_words_vertical": 3,
#     "min_words_horizontal": 3,
#     "intersection_tolerance": 3,
#     "intersection_x_tolerance": 3,
#     "intersection_y_tolerance": 3,
#     "text_tolerance": 3,
#     "text_x_tolerance": 3,
#     "text_y_tolerance": 3
# }

# Custom settings for table extraction
settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "snap_x_tolerance": 20,        # impacts the number of columns
    "snap_y_tolerance": 2.5,       # impacts row alignment and the number of rows
    "intersection_x_tolerance": 1  # impacts row separation
}
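# Tip: to see how these tolerances change what pdfplumber detects, you can
# render a page with the detected table grid overlaid (requires Pillow).
# This helper is a minimal sketch; the function name, page number and output
# path are placeholders, not part of the original script.
def debug_table_detection(pdf_path, page_number, out_path="table_debug.png"):
    with pdfplumber.open(pdf_path) as pdf:
        # Render the page and draw the lines, intersections and cells that
        # the table finder detects with the settings above
        im = pdf.pages[page_number - 1].to_image(resolution=150)
        im.debug_tablefinder(settings)
        im.save(out_path)

# Example (hypothetical page number): debug_table_detection(PDF_PATH, 5)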
def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                # extract_text() can return None on pages without text
                if "Quadro de Definições" in (page.extract_text() or ""):
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None

            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (the page with the second
            # occurrence plus the 12 pages that follow it)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))
            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages, using the custom settings
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables(table_settings=settings)
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")
            return tables
    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.

    Args:
        tables (list): List of tables extracted from the PDF.

    Returns:
        pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    # Regroup the per-page tables into one flat list of rows
    table = []
    for page_table in tables:
        for row in page_table:
            table.append(row)

    # Re-join definitions that were split by a page break: a row whose first
    # cell is empty is the continuation of the previous row's definition.
    # Iterate from last to first to avoid index offsets when deleting list elements.
    for i in range(len(table) - 2, -1, -1):
        if table[i+1][0] == '':
            table[i][1] += f"\n{table[i+1][1]}"
            del table[i+1]

    # Create a DataFrame with all processed rows
    df = pd.DataFrame(table, columns=['Term', 'Definition'])

    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    return df

if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")

    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)

    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)

            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")

    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")