java java-8 alfresco zip opencmis

How to build a zip file with a size of 400 GB in Java


I need to download all the documents from an Alfresco site that contains 400 GB of documents. The code below works for creating a small zip file (about 1 GB), but for anything larger it uses too much memory. I do not want to hold the entire ZipOutputStream content in memory; I would like to use memory only for the document currently being copied into the zip file, or use a temporary file that is overwritten for each document.

What is the best practice for this kind of problem?

This piece of code is called from my main:

FolderImpl sitoFolder = (FolderImpl) cmisObject;

// Depth -1 requests the full descendant tree of the site folder.
List<Tree<FileableCmisObject>> sitoFolderDescendants = sitoFolder.getDescendants(-1);

// NOTE: createZipFILE returns the complete archive as a byte[], so the whole
// zip is held in memory before a single byte is written to disk.
byte[] zipFile = createZipFILE(sitoFolderDescendants);
String rootPath = cartella_download_file;
File dir = new File(rootPath + File.separator);
// Fail with a clear message instead of silently ignoring a failed mkdirs().
// The trailing isDirectory() check tolerates a concurrent creation of the directory.
if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
   throw new IOException("Unable to create download directory: " + dir.getAbsolutePath());
}
Date date = new Date();
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
String stringDate = sdf.format(date);
// Strip all whitespace from the folder name so it can be used in the file name.
String nameZipFile = sitoFolder.getName().replaceAll("\\s","");
File serverFile = new File(dir.getAbsolutePath() + File.separator + stringDate+"_"+nameZipFile+".zip");
// try-with-resources closes the stream even when the write fails.
try (BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(serverFile))) {
    IOUtils.write(zipFile, bufferedOutputStream);
}

/**
 * Builds a zip archive in memory from the given CMIS trees.
 *
 * <p>Every tree in {@code list} is handed to {@code traverseTree}, which writes
 * its entries to a {@link ZipOutputStream} backed by a
 * {@link ByteArrayOutputStream}. The complete archive is therefore held in
 * memory until this method returns.
 *
 * @param list the trees whose content is written to the archive
 * @return the bytes of the zip archive, or an empty array if an
 *         {@link IOException} occurred while building it (the error is logged)
 * @throws IOException declared for callers; I/O errors raised while building
 *         the archive are caught and logged here
 */
private byte[] createZipFILE(List<Tree<FileableCmisObject>> list) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // try-with-resources guarantees the zip stream is closed on every path;
    // closing it also writes the archive trailer into baos.
    try (ZipOutputStream zos = new ZipOutputStream(baos)) {
        ReportDocument reportDocument = new ReportDocument();
        for (Tree<FileableCmisObject> aList : list) {
            traverseTree(aList, zos, reportDocument);
        }
    } catch (IOException exc) {
        reportLog.error(exc.getMessage());
        return new byte[0];
    }
    // Read the buffer only after the zip stream has been closed, so the archive is complete.
    return baos.toByteArray();
}

/**
 * Walks the children of {@code tree} recursively and writes them to the zip stream.
 *
 * <p>Documents are added through {@code addToZipFile}. Folders are added through
 * {@code addToZipFolder} only when they report zero children. An
 * {@link IOException} raised while adding a single item is logged and the
 * traversal continues with the next item.
 *
 * @param tree           the node whose children are visited
 * @param zos            the zip stream receiving the entries
 * @param reportDocument passed along unchanged to the recursive calls
 */
private void traverseTree(Tree<FileableCmisObject> tree, ZipOutputStream zos, ReportDocument reportDocument) {
    for (Tree<FileableCmisObject> node : tree.getChildren()) {
        FileableCmisObject item = node.getItem();

        if (CmisUtil.isDocument(item)) {
            Document document = (Document) item;
            try {
                addToZipFile(document, zos);
            } catch (IOException ioExc) {
                appLog.error(ioExc.getMessage());
            }
        } else if (CmisUtil.isFolder(item)) {
            Folder folder = (Folder) item;
            // Only empty folders get an entry of their own.
            boolean hasNoChildren = folder.getChildren().getTotalNumItems() == 0;
            if (hasNoChildren) {
                try {
                    addToZipFolder(folder, zos);
                } catch (IOException ioExc) {
                    appLog.error(ioExc.getMessage());
                }
            }
        }

        // Descend into the node regardless of its type.
        traverseTree(node, zos, reportDocument);
    }
}

/**
 * Adds a document to the zip stream as a single entry.
 *
 * <p>The entry name is the document's first path with {@code sito_export_path}
 * removed. The document content is copied into the entry with a 1024-byte buffer.
 *
 * @param document the document whose content is written
 * @param zos      the zip stream receiving the entry
 * @throws IOException if reading the content or writing the entry fails
 */
private void addToZipFile(Document document, ZipOutputStream zos) throws IOException {
    // Resolve the entry name before opening the content stream, so a failure
    // here cannot leave an open stream behind.
    String path = document.getPaths().get(0).replace(sito_export_path, "");
    // NOTE(review): getContentStream() may return null for a document without
    // content; that case is not handled here. Confirm against the repository data.
    try (InputStream inputStream = document.getContentStream().getStream()) {
        ZipEntry zipEntry = new ZipEntry(path);
        zos.putNextEntry(zipEntry);
        IOUtils.copy(inputStream, zos, 1024);
    }
    zos.closeEntry();
}

/**
 * Adds an empty folder to the zip stream.
 *
 * <p>The entry name is the folder's first path with {@code sito_export_path}
 * removed, followed by a trailing {@code "/"}.
 *
 * @param folder the folder to add
 * @param zos    the zip stream receiving the entry
 * @throws IOException if writing the entry fails
 */
private void addToZipFolder(Folder folder, ZipOutputStream zos) throws IOException {
    String path = folder.getPaths().get(0).replace(sito_export_path, "");
    ZipEntry zipEntry = new ZipEntry(path.concat("/"));
    zos.putNextEntry(zipEntry);
    // Close the entry explicitly, as addToZipFile does, rather than leaving it
    // open until the next entry or the end of the archive.
    zos.closeEntry();
}

Solution

  • I solved it. I first created a directory on the server and then created the zip file directly in that directory.

    The mistake was writing all the files to a ByteArrayOutputStream first and only then to the zip file on disk.

    // Target archive: <dir>/<stringDate>_<nameZipFile>.zip
    File serverFile = new File(dir.getAbsolutePath() + File.separator + stringDate+"_"+nameZipFile+".zip");
    // The zip stream writes straight to the file stream, not to an in-memory buffer.
    FileOutputStream fileOutputStream = new FileOutputStream(serverFile);
    ZipArchiveOutputStream zos = new ZipArchiveOutputStream(fileOutputStream);
    // Write every descendant tree into the archive.
    for (Tree<FileableCmisObject> aList : sitoFolderDescendants) {
       traverseTree(aList, zos, reportDocument);
    }
    // Close the archive stream once all trees have been traversed.
    zos.close();
    

    In the finally block I close the FileOutputStream. Then I changed the service methods to use ZipArchiveOutputStream and ZipArchiveEntry.

    /**
     * Writes an empty directory entry for {@code folder} to the archive.
     *
     * <p>The entry name is the folder's first path with {@code sito_export_path}
     * removed and a trailing {@code "/"} appended. An info message is logged
     * before the entry is written.
     *
     * @param folder the folder to add
     * @param zos    the archive stream receiving the entry
     * @throws IOException if writing the entry fails
     */
    private void addToZipFolder(Folder folder, ZipArchiveOutputStream zos) throws IOException {
        final String firstPath = folder.getPaths().get(0);
        final String relativePath = firstPath.replace(sito_export_path, "");
        final ZipArchiveEntry directoryEntry = new ZipArchiveEntry(relativePath + "/");

        appLog.info("aggiungo cartella vuota "+folder.getName()+" al file zip");

        // A directory entry carries no data, so it is opened and closed immediately.
        zos.putArchiveEntry(directoryEntry);
        zos.closeArchiveEntry();
    }
    
    /**
     * Streams a document's content into the archive as a single entry.
     *
     * <p>The entry name is the document's first path with {@code sito_export_path}
     * removed. The content is copied through a fixed 1024-byte buffer, so memory
     * use does not grow with the document size.
     *
     * @param document the document whose content is written
     * @param zos      the archive stream receiving the entry
     * @throws IOException if reading the content or writing the entry fails
     */
    private void addToZipFile(Document document, ZipArchiveOutputStream zos) throws IOException {
        String path = document.getPaths().get(0).replace(sito_export_path, "");
        ZipArchiveEntry entry = new ZipArchiveEntry(path);
        // The content length can be reported as -1 when it is unknown, and
        // ZipArchiveEntry.setSize rejects negative sizes, so set it only when known.
        long contentLength = document.getContentStreamLength();
        if (contentLength >= 0) {
            entry.setSize(contentLength);
        }
        // try-with-resources closes the content stream even if the copy fails.
        // NOTE(review): getContentStream() may return null for a document without
        // content; that case is not handled here. Confirm against the repository data.
        try (InputStream inputStream = document.getContentStream().getStream()) {
            zos.putArchiveEntry(entry);
            byte[] buffer = new byte[1024];
            int nRead;
            while ((nRead = inputStream.read(buffer, 0, buffer.length)) > 0) {
                zos.write(buffer, 0, nRead);
            }
        }
        zos.closeArchiveEntry();
    }