pythonexcelopenpyxl

How can I get each page of "Page Break Preview" in Excel using Python?


Problem

I'm trying to get each page of "Page Break Preview". Precisely speaking, I need to know which cells Page 1 contains, which cells Page 2 contains and so on.

Could you teach me how to realize this functionality in Python 3.12? I tried openpyxl, but other libraries are welcome too.

Example (What I Tried)

I find openpyxl has print_area variable, but this variable only returns the whole print area, which contains all pages, so I can't know which page each cell belongs to.

As you can see the code and the screenshot, the result is A1:R50, which contains both Page 1 and Page 2.

In this example, the expected result is something like {'Page1': 'A1:I50', 'Page2':'J1:R50'}.

from openpyxl import load_workbook

workbook = load_workbook(filename="./sample_loader_data/sample.xlsx", 
                         read_only=True, 
                         data_only=True, 
                         )
workbook.worksheets[0].print_area # Result: "'Test worksheet'!$A$1:$R$50"

enter image description here


Solution

  • As commented here, this is not possible with openpyxl

    UPDATE: As @dmjy correctly answered, it is now possible with openpyxl.

    from openpyxl import load_workbook
    from openpyxl.utils.cell import get_column_letter
    import re
    
    workbook = load_workbook(filename="Demo.xlsx")
    
    vertical_breaks = [0]+[page_break.id for page_break in workbook.worksheets[0].col_breaks.brk]
    horizontal_breaks = [0]+[page_break.id for page_break in workbook.worksheets[0].row_breaks.brk]
    
    last_cell = workbook.worksheets[0].dimensions.split(":")[1]
    last_col = ''.join(re.findall(r'[a-zA-Z]+', last_cell))
    last_row = ''.join(re.findall(r'\d+', last_cell))
    
    pages = {}
    cnt=1
    for v, v_break in enumerate(vertical_breaks):
        for h, h_break in enumerate(horizontal_breaks):
            start_col = get_column_letter(vertical_breaks[v]+1)
            start_row = horizontal_breaks[h]+1
            end_col = get_column_letter(vertical_breaks[v+1]) if v+1 < len(vertical_breaks) else last_col
            end_row = horizontal_breaks[h+1] if h+1 < len(horizontal_breaks) else last_row
            pages[f"Page{cnt}"] = f"{start_col}{start_row}:{end_col}{end_row}"
            cnt+=1
    
    print(pages)
    

    For this example

    enter image description here

    It correctly outputs

    {'Page1': 'A1:F27', 'Page2': 'A28:F54', 'Page3': 'G1:N27', 'Page4': 'G28:N54', 'Page5': 'O1:T27', 'Page6': 'O28:T54'}


    Alternative with xlrd for .xls files.

    import xlrd
    
    def num_to_excel_col(n):
        if n < 1:
            raise ValueError("Number must be positive")
        result = ""
        while True:
            if n > 26:
                n, r = divmod(n - 1, 26)
                result = chr(r + ord('A')) + result
            else:
                return chr(n + ord('A') - 1) + result
    
    
    workbook = xlrd.open_workbook("Demo.xls", formatting_info=True)
    worksheet = workbook.sheets()[0]
    
    vertical_breaks = worksheet.vertical_page_breaks
    vertical_breaks.insert(0, (0,0,0))
    horizontal_breaks = worksheet.horizontal_page_breaks
    horizontal_breaks.insert(0, (0,0,0))
    
    last_row = worksheet.nrows
    last_col = worksheet.ncols
    
    pages = {}
    cnt=1
    for v, v_break in enumerate(vertical_breaks):
        for h, h_break in enumerate(horizontal_breaks):
            start_col = num_to_excel_col(vertical_breaks[v][0]+1)
            start_row = horizontal_breaks[h][0]+1
            end_col = num_to_excel_col(vertical_breaks[v+1][0] if v+1 < len(vertical_breaks) else last_col)
            end_row = horizontal_breaks[h+1][0] if h+1 < len(horizontal_breaks) else last_row
            pages[f"Page{cnt}"] = f"{start_col}{start_row}:{end_col}{end_row}"
            cnt+=1
    
    print(pages)
    

    {'Page1': 'A1:F27', 'Page2': 'A28:F54', 'Page3': 'G1:N27', 'Page4': 'G28:N54', 'Page5': 'O1:T27', 'Page6': 'O28:T54'}