pythonzugferd

Extracting the embedded XML file from a ZUGFeRD PDF


I need to get the embedded XML from my PDF file. I read that the embedded file should be in /EmbeddedFiles. If I try to get this data through this site, https://eforms.aloaha.com/zugferd.aspx, I get an XML file, but if I try to get it through my function, the program reports: The specified PDF file does not contain an embedded ZUGFeRD XML file.

This is what I tried:

 def extract_zugferd_xml(pdf_file_path):
        """
        Try to extract the ZUGFeRD XML from a PDF file and return it as a string.

        The search is done page by page: each page dictionary is tested for an
        '/EmbeddedFiles' key, and the first entry whose name ends in '.xml' and
        whose data contains 'ZUGFeRD' is decoded as UTF-8 and returned. Along
        the way several diagnostic values are printed for every page.

        NOTE(review): the page dictionary printed for the sample file holds
        only /Type, /MediaBox, /Rotate, /Parent, /Resources and /Contents, so
        the '/EmbeddedFiles' test never matches there and the call ends in the
        ValueError. Presumably attachments are registered at document level
        (catalog -> /Names -> /EmbeddedFiles) rather than on pages -- confirm
        against the PDF specification.

        Args:
            pdf_file_path (str): The path to the PDF file to extract the XML from.

        Returns:
            str: The ZUGFeRD XML as a string.

        Raises:
            ValueError: If no page provides a matching embedded XML file.
        """
        # Open the PDF file in binary mode using PyPDF2
        with open(pdf_file_path, 'rb') as pdf_file:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Get the number of pages in the PDF file
            # (previously tried: pdf_reader.getNumPages())
            num_pages = len(pdf_reader.pages)

            # Loop through each page in the PDF file
            for page_num in range(num_pages):
                # Get the page object for the current page
                # (previously tried: pdf_reader.getPage(page_num))
                page =  pdf_reader.pages[page_num]

                # Diagnostic output: the page dictionary, its media box and the
                # first entry of the parent node's /Kids array
                print("PAGE ", page)
                print("MediaBox ", page['/MediaBox'])
                print("Parent ", page['/Parent']['/Kids'][0])
                print("Parent type", type(page['/Parent']['/Kids']))

                # NOTE(review): this rebinds pdf_reader to a new reader on the
                # same file handle on every iteration; the one created before
                # the loop would presumably serve the same purpose -- confirm
                pdf_reader = PyPDF2.PdfReader(pdf_file)

                # Get the indirect reference object (first child of the parent node)
                indirect_ref = page['/Parent']['/Kids'][0]

                # Get the referenced object directly
                ref_obj = pdf_reader.get_object(indirect_ref)

                # Print the content of the object (diagnostic only; not used below)
                print("ref_obj ", ref_obj)

                # Check if the page has any embedded files
                if '/EmbeddedFiles' in page:
                    # Get the embedded files dictionary
                    embedded_files = page['/EmbeddedFiles']

                    # Loop through each entry in the embedded files dictionary
                    for file_name, file_obj in embedded_files.items():
                        # Check if the file is a ZUGFeRD XML file
                        # NOTE(review): the .decode() below suggests getData()
                        # returns bytes; if so, testing a str for membership in
                        # it would raise TypeError -- confirm
                        if file_name.endswith('.xml') and 'ZUGFeRD' in file_obj.getData():
                            # Extract and return the XML data as a string
                            return file_obj.getData().decode('utf-8')

        # If the function reaches this point, it means the PDF file does not contain an embedded ZUGFeRD XML file
        raise ValueError('The specified PDF file does not contain an embedded ZUGFeRD XML file.')

The printed output is:

PAGE  {'/Type': '/Page', '/MediaBox': [0, 0, 595, 842], '/Rotate': 0, '/Parent': IndirectObject(3, 0, 4370055712), '/Resources': {'/ProcSet': ['/PDF', '/Text'], '/Font': IndirectObject(21, 0, 4370055712)}, '/Contents': IndirectObject(11, 0, 4370055712)}
MediaBox  [0, 0, 595, 842]
Parent  IndirectObject(10, 0, 4370055712)
Parent type <class 'PyPDF2.generic._data_structures.ArrayObject'>

Why does this happen, and where is /EmbeddedFiles located in the PDF?


Solution

  • I found a solution. It turns out that embedded files are stored under specific entries of the PDF's document catalog. In my files they can be found at the path /Root/Names/EmbeddedFiles/Names or /Root/Names/EmbeddedFiles/Kids. Here is the full code:

    def _collect_embedded_files(node, attachments):
        """
        Walk one node of the /EmbeddedFiles name tree and fill *attachments*.

        A node may carry a '/Names' array laid out as
        [name1, filespec1, name2, filespec2, ...] and/or a '/Kids' array of
        child nodes with the same layout; both are handled, children
        recursively.

        :param node: dictionary-like name tree node
        :param attachments: dict updated in place with {file name: file bytes}
        """
        if '/Names' in node:
            pairs = node['/Names']
            # Even slots hold the names, odd slots the matching file specs.
            for name, spec_ref in zip(pairs[::2], pairs[1::2]):
                spec = spec_ref.get_object()
                # Skip file specifications that carry no embedded stream
                # instead of aborting the whole extraction.
                if '/EF' in spec and '/F' in spec['/EF']:
                    attachments[name] = spec['/EF']['/F'].get_data()
        if '/Kids' in node:
            for kid in node['/Kids']:
                _collect_embedded_files(kid.get_object(), attachments)


    def get_attachments(reader):
        """
        Retrieve the file attachments of the PDF as a dictionary of file names
        and the file data as a bytestring.

        The attachments are looked up in the document catalog under
        /Root/Names/EmbeddedFiles. Any number of attachments is supported,
        whether they are listed directly in '/Names' or spread over '/Kids'
        nodes.

        :param reader: PDF reader object exposing ``trailer["/Root"]``
        :return: dictionary of filenames and bytestrings; empty when the PDF
                 has no embedded files or when reading them fails
        """
        attachments = {}
        try:
            catalog = reader.trailer["/Root"]
            # A PDF without attachments simply has no /Names/EmbeddedFiles entry.
            if '/Names' not in catalog or '/EmbeddedFiles' not in catalog['/Names']:
                return attachments
            _collect_embedded_files(catalog['/Names']['/EmbeddedFiles'], attachments)
        except Exception as e:
            # Best-effort boundary: report the problem and return nothing
            # rather than a partially filled result.
            print(f"Exception {e}")
            return {}
        return attachments
    
    
    # Open the invoice in binary mode; the with-block closes the file handle
    # even if reading the PDF fails.
    with open('Testinvoice.pdf', 'rb') as pdf_file:
        # Create a PDF reader object
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Collect the attachments while the file is still open
        # (mapping of attachment file name -> raw bytes).
        xml_data = get_attachments(pdf_reader)