python zip epub python-zipfile

Zipfile not compressing some SVG files in Python


I've been writing a script that converts LaTeX code into SVG images inside EPUB files. The idea is to extract the EPUB into a temporary directory, find the LaTeX code, render it to SVG, replace the code with links to the SVG images, and then compress everything back into an EPUB.

Everything works fine except the final compression. It compresses everything except the new SVG files I've created (I've checked that they are in the Images folder of the temporary uncompressed EPUB). Here is a minimal working example:

import zipfile
import os
import shutil

def create_minimal_uncompressed_epub(directory):
    """Build the unpacked file tree of a minimal EPUB under *directory*.

    An existing tree at *directory* is removed first (errors during the
    removal are ignored). Afterwards the directory holds ``mimetype``,
    ``META-INF/container.xml`` and, inside ``OEBPS``, the package document,
    the two tables of contents, a single content page and a stylesheet.
    """
    container_markup = (
        '<?xml version="1.0"?>'
        '<container version="1.0" xmlns="urn:oasis:names:'
        'tc:opendocument:xmlns:container">'
        '<rootfiles>'
        '<rootfile full-path="OEBPS/content.opf" media-type='
        '"application/oebps-package+xml"/>'
        '</rootfiles>'
        '</container>'
    )
    package_markup = (
        '<?xml version="1.0" encoding="UTF-8" ?><package xmlns='
        '"http://www.idpf.org/2007/opf" xmlns:dc="http://purl.o'
        'rg/dc/elements/1.1/" unique-identifier="db-id" version'
        '="3.0"><metadata><dc:title id="t1">Title</dc:title><dc'
        ':identifier id="db-id">isbn</dc:identifier><meta   pro'
        'perty="dcterms:modified">2014-03-27T09:14:09Z</meta><d'
        'c:language>en</dc:language></metadata><manifest><item '
        'id="toc" properties="nav" href="toc.xhtml" media-type='
        '"application/xhtml+xml" /><item id="ncx" href="toc.ncx'
        '" media-type="application/x-dtbncx+xml" /><item id="te'
        'mplate_css" href="template.css" media-type="text/css" '
        '/><item id="hello" href="1_hello.xhtml" media-type="ap'
        'plication/xhtml+xml" /></manifest><spine toc="ncx"><it'
        'emref idref="hello" /></spine></package>'
    )
    nav_markup = (
        '<?xml version="1.0" encoding="utf-8"?><html '
        'xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="htt'
        'p://www.idpf.org/2007/ops"><head><title>toc.xhtml</t'
        'itle><link href="template.css" rel="stylesheet" type'
        '="text/css" /></head><body><nav id="toc" epub:type="'
        'toc"><h1 class="frontmatter">Table of Contents</h1><'
        'ol class="contents"><li><a href="1_hello.xhtml">Hell'
        'o</a></li></ol></nav></body></html>'
    )
    ncx_markup = (
        '<?xml version="1.0" encoding="UTF-8" ?><ncx version="2005'
        '-1" xml:lang="en" xmlns="http://www.daisy.org/z3986/2005/'
        'ncx/"><head><meta name="dtb:uid" content="isbn"/><meta na'
        'me="dtb:depth" content="1"/></head><docTitle><text></text'
        '></docTitle><navMap><navPoint id="hello" playOrder="1"><n'
        'avLabel><text>cover</text></navLabel><content src="1_hell'
        'o.xhtml" /></navPoint></navMap></ncx>'
    )
    page_markup = (
        '<?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.or'
        'g/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"><head><titl'
        'e>1_hello.xhtml</title><link href="template.css" rel="stylesheet" t'
        'ype="text/css" /></head><body><h1>Hello World!</h1></body></html> '
    )
    stylesheet = 'h1 {text-align: center;}'

    def _emit(text, *relative_parts):
        # Write *text* to the file at directory/relative_parts (text mode).
        with open(os.path.join(directory, *relative_parts), 'w') as handle:
            handle.write(text)

    # Start from an empty directory so stale files never leak into the book.
    if os.path.exists(directory):
        shutil.rmtree(directory, ignore_errors=True)
    os.makedirs(directory)

    _emit('application/epub+zip', 'mimetype')

    for subfolder in ('META-INF', 'OEBPS'):
        os.makedirs(os.path.join(directory, subfolder))

    _emit(container_markup, 'META-INF', 'container.xml')

    oebps_files = (
        ('content.opf', package_markup),
        ('toc.xhtml', nav_markup),
        ('toc.ncx', ncx_markup),
        ('1_hello.xhtml', page_markup),
        ('template.css', stylesheet),
    )
    for filename, text in oebps_files:
        _emit(text, 'OEBPS', filename)

def recursive_zip(zipf, directory, folder=None):
    """Add every regular file below *directory* to the open archive *zipf*.

    Files are stored with DEFLATE compression; sub-directories are walked
    recursively. Directories themselves (including empty ones) get no entry.

    Args:
        zipf: A ``zipfile.ZipFile`` opened for writing.
        directory: Path on disk whose contents are archived.
        folder: Path prefix for the entry names inside the archive.
            ``None`` (the default) places the entries at the archive root.
    """
    # os.path.join(None, item) raises TypeError, so the default value has to
    # be mapped to an empty prefix before any archive name is built.
    prefix = '' if folder is None else folder
    nodes = os.listdir(directory)
    # Debug trace of what was found; the call form works on Python 2 and 3.
    print(nodes)
    for item in nodes:
        source = os.path.join(directory, item)
        arcname = os.path.join(prefix, item)
        if os.path.isfile(source):
            zipf.write(source, arcname, zipfile.ZIP_DEFLATED)
        elif os.path.isdir(source):
            recursive_zip(zipf, source, arcname)

def create_svg():
    """Return the placeholder text that stands in for generated SVG markup."""
    placeholder = 'code here\n'
    return placeholder

TEMP_DIR = 'minimal_temp_dir'
SVG_FILENAME = 'minimal_svg_filename.svg'
MINIMAL_EPUB = 'minimal_epub.epub'

# Build the unpacked book, then drop the generated SVG next to its content.
# NOTE: the manifest in content.opf has no <item> for this SVG, so EPUB tools
# may not show it even though it ends up inside the archive.
create_minimal_uncompressed_epub(TEMP_DIR)
with open(os.path.join(TEMP_DIR, 'OEBPS', SVG_FILENAME), 'w') as svgfile:
    svgfile.write(create_svg())

try:
    ZIPF = zipfile.ZipFile(MINIMAL_EPUB, 'w')
    try:
        # mimetype goes in first and uncompressed, ahead of everything else.
        ZIPF.write(os.path.join(TEMP_DIR, 'mimetype'), 'mimetype', zipfile.ZIP_STORED)
        # Only the top-level directories are archived; each one is walked
        # recursively and keeps its own name as the prefix inside the archive.
        for item in os.listdir(TEMP_DIR):
            if os.path.isdir(os.path.join(TEMP_DIR, item)):
                recursive_zip(ZIPF, os.path.join(TEMP_DIR, item), item)
    finally:
        # Always finalize the archive so the file handle is not leaked.
        ZIPF.close()
except (IOError, OSError):
    # Only file-system failures are reported here; programming errors and
    # KeyboardInterrupt propagate instead of being silently swallowed.
    print('\nError compressing file')

The function recursive_zip actually finds every file (notice the 'print nodes' inside it). I have no idea why the SVG files are missing; no errors are raised. The SVG files are in the temp folder, but not in the final compressed version when I open it with Sigil.


Solution

  • I finally figured out what was happening. I found a list of images in the file content.opf, inside a tag named manifest, and according to the International Digital Publishing Forum:

    The required manifest must provide a list of all the files that are part of the publication (e.g. Content Documents, style sheets, image files, any embedded font files, any included schemas).

    So the files were actually being compressed and included in the zip file, but when the archive was renamed to .epub and opened with Sigil, the SVG images didn't show up because they were not listed in the manifest in content.opf.