c++ pdf podofo

Copy only necessary objects from PDF file


I've got a huge PDF file with more than 100 pages and I want to split it into single PDF files (containing only one page each). The problem is that PoDoFo does not copy just the page, but the whole document, because of the references (and so each of the 100 PDF files has the same size as the 100-page PDF). A relevant mailing list post can be found, but unfortunately no solution is provided there.

The source code of the InsertPages function contains this explanation:

This function works a bit different than one might expect. Rather than copying one page at a time - we copy the ENTIRE document and then delete the pages we aren't interested in.

We do this because
1) SIGNIFICANTLY simplifies the process
2) Guarantees that shared objects aren't copied multiple times
3) offers MUCH faster performance for the common cases

HOWEVER: because PoDoFo doesn't currently do any sort of "object garbage collection" during a Write() - we will end up with larger documents, since the data from unused pages will also be in there.

I have tried a few methods to copy only the relevant objects, but each of them failed.

None of them worked out. Does anybody have an idea or a working solution for this problem?


Solution

  • Dennis from the support team sent me a working example of an optimized version of the InsertPages function, which actually fixes the page references and decreases the document size significantly!

    void PdfMemDocument::InsertPages2(const PdfMemDocument & rDoc, std::vector<int> pageNumbers)
    {
        std::unordered_set<PdfObject*> totalSet;
        std::vector<pdf_objnum> oldObjNumPages;
        std::unordered_map<pdf_objnum, pdf_objnum> oldObjNumToNewObjNum;
    
        std::vector<PdfObject*> newPageObjects;
    
        // Collect all dependencies from all pages that are to be copied
        for (int i = 0; i < pageNumbers.size(); ++i) {
            PdfPage* page = rDoc.GetPage(pageNumbers[i]);
            if (page) {
                oldObjNumPages.push_back(page->GetObject()->Reference().ObjectNumber());
                std::unordered_set<PdfObject*> *set = page->GetPageDependencies();
                totalSet.insert(set->begin(), set->end());
                delete set;
            }
        }
    
        // Create a new page object for every copied page from the old document
        // Copy all objects the pages depend on to the new document
        for (auto it = totalSet.begin(); it != totalSet.end(); ++it) {
            unsigned int length = static_cast<unsigned int>(GetObjects().GetSize() + GetObjects().GetFreeObjects().size());
            PdfReference ref(static_cast<unsigned int>(length+1), 0);
            PdfObject* pObj = new PdfObject(ref, *(*it));
            pObj->SetOwner(&(GetObjects()));
            if ((*it)->HasStream()) {
                PdfStream *stream = (*it)->GetStream();
                pdf_long length;
                char* buf;
                stream->GetCopy(&buf, &length);
                PdfMemoryInputStream inputStream(buf, length);
                pObj->GetStream()->SetRawData(&inputStream, length);
                free(buf);
    
            }
            oldObjNumToNewObjNum.insert(std::pair<pdf_objnum, pdf_objnum>((*it)->Reference().ObjectNumber(), length+1));
            GetObjects().push_back(pObj);
            newPageObjects.push_back(pObj);
        }
    
        // In all copied objects, fix the object numbers so they are valid in the new document
        for (auto it = newPageObjects.begin(); it != newPageObjects.end(); ++it) {
            FixPageReferences(GetObjects(), *it, oldObjNumToNewObjNum);
        }
    
        // Insert the copied pages into the pages tree
        for (auto it = oldObjNumPages.begin(); it != oldObjNumPages.end(); ++it) {
            PdfObject* pageObject = GetObjects().GetObject(PdfReference(oldObjNumToNewObjNum[(*it)], 0));
            PdfPage *page = new PdfPage(pageObject, std::deque<PdfObject*>());
            GetPagesTree()->InsertPage(GetPageCount() - 1, page);
        }
    
    }
    
    std::unordered_set<PdfObject *>* PdfPage::GetPageDependencies() const
    {
        std::unordered_set<PdfObject *> *set = new std::unordered_set<PdfObject *>();
    
        const PdfObject* pageObj = GetObject();
        if (pageObj) {
            PdfVecObjects* objects = pageObj->GetOwner();
            if (objects) {
                set->insert((PdfObject*)pageObj);
                objects->GetObjectDependencies2(pageObj, *set);
            }
        }
    
        return set;
    }
    
    // Optimized version of PdfVecObjects::GetObjectDependencies
    void PdfVecObjects::GetObjectDependencies2(const PdfObject* pObj, std::unordered_set<PdfObject*> &refMap) const
    {
        // Check objects referenced from this object
        if (pObj->IsReference())
        {
            PdfObject* referencedObject = GetObject(pObj->GetReference());
            if (referencedObject != NULL && refMap.count(referencedObject) < 1) {
                (refMap).insert((PdfObject *)referencedObject); // Insert referenced object
                GetObjectDependencies2((const PdfObject*)referencedObject, refMap);
            }
        }
        else {
            // Recursion
            if (pObj->IsArray())
            {
                PdfArray::const_iterator itArray = pObj->GetArray().begin();
                while (itArray != pObj->GetArray().end())
                {
                    GetObjectDependencies2(&(*itArray), refMap);
                    ++itArray;
                }
            }
            else if (pObj->IsDictionary())
            {
                TCIKeyMap itKeys = pObj->GetDictionary().GetKeys().begin();
                while (itKeys != pObj->GetDictionary().GetKeys().end())
                {
                    if ((*itKeys).first != PdfName("Parent")) {
                        GetObjectDependencies2((*itKeys).second, refMap);
                    }
                    ++itKeys;
                }
            }
        }
    }
    
    /**
     * Recursively rewrite every indirect reference found in pObject so that
     * it uses the object number assigned in the destination document.
     *
     * Dictionaries and arrays are traversed element by element; dictionary
     * entries stored under the key "Parent" are left untouched. A reference
     * is replaced by a reference to the mapped object number with generation 0.
     *
     * \param objects        not used by the current implementation; kept so
     *                       the signature stays unchanged for callers
     * \param pObject        object to fix in place; must not be NULL
     * \param oldNumToNewNum maps source object numbers to destination object
     *                       numbers; it is only read, never modified
     *
     * Raises ePdfError_InvalidHandle if pObject is NULL and ePdfError_NoObject
     * if a reference has no (non-zero) entry in oldNumToNewNum.
     */
    void FixPageReferences(PdfVecObjects& objects, PdfObject* pObject, std::unordered_map<pdf_objnum, pdf_objnum>& oldNumToNewNum) {
        if( !pObject )
        {
            PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
        }

        if( pObject->IsDictionary() )
        {
            const PdfName parentKey("Parent");
            TKeyMap& keys = pObject->GetDictionary().GetKeys();

            for( TKeyMap::iterator it = keys.begin(); it != keys.end(); ++it )
            {
                if( it->first != parentKey )
                {
                    FixPageReferences(objects, it->second, oldNumToNewNum);
                }
            }
        }
        else if( pObject->IsArray() )
        {
            PdfArray& elements = pObject->GetArray();

            for( PdfArray::iterator it = elements.begin(); it != elements.end(); ++it )
            {
                FixPageReferences(objects, &(*it), oldNumToNewNum);
            }
        }
        else if( pObject->IsReference() )
        {
            const pdf_objnum oldnum = pObject->GetReference().ObjectNumber();

            // find() instead of operator[]: a miss must not insert a bogus
            // zero entry into the caller's map.
            const auto mapped = oldNumToNewNum.find(oldnum);
            if( mapped == oldNumToNewNum.end() || !mapped->second )
            {
                // Raised as a PdfError by value, like the null-handle check
                // above, rather than as a heap-allocated std::exception pointer.
                PODOFO_RAISE_ERROR_INFO( ePdfError_NoObject, "No new object number for old object number" );
            }

            *pObject = PdfReference(mapped->second, 0);
        }
    }