I need to extract all images from a PDF file with PoDoFo. Extracting all images from the whole file works well; I used the image extractor example (podofoimgextract) for that. It retrieves all objects in the document and iterates over them. But I need to iterate over the pages and check for image objects on each page. Does anyone know how to do that?
Piggybacking off podofoimgextract, you could iterate over each page, get the page's resources object, look for an XObject or Image entry in it, and from there it's pretty much the exact same code that is used in the image extraction utility.
// Walk every page of `document` and hand each image XObject that the page's
// resource dictionary refers to over to ExtractImage().
//
// For each page:
//   1. fetch the page's resources object (skipped if absent / not a dictionary),
//   2. look at the "XObject" (and "Image") entries of that dictionary,
//   3. resolve every indirect reference stored in those entries,
//   4. call ExtractImage(object, isJpeg), where isJpeg is true only when the
//      stream's sole filter is DCTDecode.
for (int pageN = 0; pageN < document.GetPageCount(); pageN++) {
    PdfPage* page = document.GetPage(pageN);
    if (!page)
        continue;

    // A page is not guaranteed to expose a resources dictionary; guard the
    // pointer before dereferencing it.
    auto resourcesObj = page->GetResources();
    if (!resourcesObj || !resourcesObj->IsDictionary())
        continue;

    // Bind by reference: copying a PdfDictionary duplicates every entry and
    // is not needed just to read the keys.
    const PdfDictionary& resource = resourcesObj->GetDictionary();

    for (const auto& k : resource.GetKeys()) {
        const bool isImageCategory =
            k.first.GetName() == "XObject" || k.first.GetName() == "Image";
        // NOTE(review): only an inline dictionary is handled here; if the
        // entry is itself an indirect reference it is skipped - confirm
        // whether that case needs to be resolved as well.
        if (!isImageCategory || !k.second->IsDictionary())
            continue;

        for (const auto& r : k.second->GetDictionary().GetKeys()) {
            // The XObject dictionary will usually contain indirect objects
            // as its values, so only references are followed.
            if (!r.second->IsReference())
                continue;

            // Get the object that is being referenced. The lookup yields
            // null when the reference does not resolve to a known object.
            auto target =
                document.GetObjects().GetObject(r.second->GetReference());
            if (!target || !target->IsDictionary())
                continue;

            auto& targetDict = target->GetDictionary();

            // The XObject resource category also holds form and PostScript
            // XObjects. Skip anything explicitly tagged with a subtype other
            // than "Image"; entries without a subtype are still processed.
            auto subtype = targetDict.GetKey(PdfName::KeySubtype);
            if (subtype && subtype->IsName() &&
                subtype->GetName().GetName() != "Image")
                continue;

            // NOTE(review): objects without a /Filter entry are skipped, so
            // uncompressed images are not extracted - confirm this is wanted.
            auto kf = targetDict.GetKey(PdfName::KeyFilter);
            if (!kf)
                continue;

            // A filter may be given as a one-element array; unwrap it so the
            // single-name check below covers both spellings.
            if (kf->IsArray() && kf->GetArray().GetSize() == 1 &&
                kf->GetArray()[0].IsName() &&
                kf->GetArray()[0].GetName().GetName() == "DCTDecode") {
                kf = &kf->GetArray()[0];
            }

            // DCTDecode as the only filter means the stream is already a
            // JPEG; everything else takes the generic path.
            const bool isJpeg =
                kf->IsName() && kf->GetName().GetName() == "DCTDecode";
            ExtractImage(target, isJpeg);
        }
    }
}