swift cocoa cgpdfdocument

swift CGPDFDocument parsing


I'm trying to use Swift to parse the contents of PDF documents, following Apple's programming guide (in which all the examples are ObjC...)

// Location of the PDF to inspect, bridged to the CFString that the CFURL call takes.
let filepath = "/Users/ben/Desktop/Test.pdf"
let localUrl  = filepath as CFString

// Applier invoked once per dictionary entry. Its arguments are interpolated
// as-is, which is why the output further down shows addresses instead of values.
let dumpEntry: CGPDFDictionaryApplierFunction = { key, object, info in
    print("\(key), \(object), \(info)")
}

if let pdfURL = CFURLCreateWithFileSystemPath(nil, localUrl, CFURLPathStyle.cfurlposixPathStyle, false),
   let pdf = CGPDFDocument(pdfURL) {
    // Walk the Info dictionary when the document has one.
    if let inf = pdf.info {
        CGPDFDictionaryApplyFunction(inf, dumpEntry, nil)
    }
    // Walk the document catalog.
    if let cat = pdf.catalog {
        CGPDFDictionaryApplyFunction(cat, dumpEntry, nil)
    }
}

While this seems to produce some results, it's just strings of hex digits.

0x00007ff29f43ce00, 0x00007ff29f492bd0, nil
0x00007ff29f443b60, 0x00007ff29f492cd0, nil
0x00007ff29f482590, 0x00007ff29f492dd0, nil
0x00007ff29f482a40, 0x00007ff29f492ed0, nil
0x00007ff29f482e30, 0x00007ff29f492fe0, nil
0x00007ff29f47da20, 0x00007ff29f4930e0, nil
0x00007ff29f474ac0, 0x00007ff29f842b50, nil
0x00007ff29f43f5d0, 0x00007ff29f842bf0, nil
0x00007ff29f485eb0, 0x00007ff29f842a60, nil
0x00007ff29f482f70, 0x00007ff29f842ab0, nil
0x00007ff29f48b1c0, 0x00007ff29f48f6d0, nil

So how do I get the actual data? Ideally, I'm trying to get at the document metadata and things like the fonts it contains.


Solution

  • Your code for retrieving the top-level catalog and info dictionaries is correct, but you need to expand the decoding inside the function passed to CGPDFDictionaryApplyFunction so that it displays the values of the PDF data according to their types (integer, string, array, dictionary, and so on). The signature of the CGPDFDictionaryApplierFunction you are passing is:

    // Signature of the callback invoked for each dictionary entry:
    //   1. the entry's key, as a C string
    //   2. an opaque reference to the entry's value
    //   3. a context pointer — presumably the last argument given to
    //      CGPDFDictionaryApplyFunction (nil in the examples here)
    typealias CGPDFDictionaryApplierFunction = (UnsafePointer<Int8>, COpaquePointer, UnsafeMutablePointer<()>) -> Void

    Your program is displaying the pointers to the data; you can access the data values according to their types as shown below (Swift 2):

        // Open the PDF at a fixed path on disk.
        let filepath = "/Users/ben/Desktop/Test.pdf"
        let urlDocument = NSURL(fileURLWithPath: filepath)
        let myDocument = CGPDFDocumentCreateWithURL(urlDocument)
        if myDocument == nil {
            print("Cannot open PDF document")
        } else {
            print("Number of pages: \(CGPDFDocumentGetNumberOfPages(myDocument))")
            // Dump every entry of the document catalog, then of the Info dictionary,
            // using printPDFKeys as the per-entry callback.
            CGPDFDictionaryApplyFunction(CGPDFDocumentGetCatalog(myDocument), printPDFKeys, nil)
            CGPDFDictionaryApplyFunction(CGPDFDocumentGetInfo(myDocument), printPDFKeys, nil)
        }
    

    In order to be passed to CGPDFDictionaryApplyFunction, printPDFKeys must be declared as a global function (outside your main class); alternatively, you could insert the code in a closure passed to CGPDFDictionaryApplyFunction, as in your example above. The code below is shortened and does not include complete protection against errors and null values.

    /// Prints one entry of a PDF dictionary, decoding the value according to its
    /// PDF object type, and recurses into nested dictionaries.
    ///
    /// Meant to be passed to `CGPDFDictionaryApplyFunction` as the per-entry callback.
    /// The object type is one of: Null, Boolean, Integer, Real, Name, String,
    /// Array, Dictionary, Stream.
    ///
    /// - Parameter key: The entry's key as a C string.
    /// - Parameter object: The PDF object stored under `key`.
    /// - Parameter info: Context pointer handed to the callback; not used here.
    func printPDFKeys( key: UnsafePointer<Int8>, object: COpaquePointer, info: UnsafeMutablePointer<()>) {
        // An entry whose key cannot be decoded is skipped entirely.
        guard let keyString = String(CString: UnsafePointer<CChar>(key), encoding: NSISOLatin1StringEncoding) else {
            return
        }
        let objectType = CGPDFObjectGetType(object)
        print("key \(keyString) is present in dictionary, type \(objectType.rawValue)")

        switch objectType {
        case .Boolean:
            var objectBoolean = CGPDFBoolean()
            if CGPDFObjectGetValue(object, objectType, &objectBoolean) {
                let testbool = NSNumber(unsignedChar: objectBoolean)
                print("Boolean value \(testbool)")
            }
        case .Integer:
            var objectInteger = CGPDFInteger()
            if CGPDFObjectGetValue(object, objectType, &objectInteger) {
                print("Integer value \(objectInteger)")
            }
        case .Real:
            var objectReal = CGPDFReal()
            if CGPDFObjectGetValue(object, objectType, &objectReal) {
                print("Real value \(objectReal)")
            }
        case .Name:
            // A name is delivered as a pointer to a C string.
            var namePointer = UnsafePointer<Int8>()
            if CGPDFObjectGetValue(object, objectType, &namePointer) {
                if let stringName = String(CString: UnsafePointer<CChar>(namePointer), encoding: NSISOLatin1StringEncoding) {
                    print("Name value: \(stringName)")
                }
            }
        case .String:
            // Only convert the string when the lookup succeeded; a failed lookup
            // leaves the reference null and nothing is printed.
            var objectString = CGPDFStringRef()
            if CGPDFObjectGetValue(object, objectType, &objectString) {
                if let stringValue = CGPDFStringCopyTextString(objectString) {
                    print("String value: \(stringValue)")
                }
            }
        case .Array:
            print("Array")
            var objectArray = CGPDFArrayRef()
            if CGPDFObjectGetValue(object, objectType, &objectArray) {
                print("array: \(arrayFromPDFArray(objectArray))")
            }
        case .Dictionary:
            var objectDictionary = CGPDFDictionaryRef()
            if CGPDFObjectGetValue(object, objectType, &objectDictionary) {
                let count = CGPDFDictionaryGetCount(objectDictionary)
                print("Found dictionary with \(count) entries")
                // "Parent" and "P" entries are not descended into — presumably
                // because they point back up the tree and would recurse forever.
                // NOTE(review): other back-references, if a document has any,
                // are still followed.
                if keyString != "Parent" && keyString != "P" {
                    CGPDFDictionaryApplyFunction(objectDictionary, printPDFKeys, nil)
                }
            }
        case .Stream:
            print("Stream")
            var objectStream = CGPDFStreamRef()
            if CGPDFObjectGetValue(object, objectType, &objectStream) {
                var fmt: CGPDFDataFormat = .Raw
                // The stream data may be unavailable; skip it instead of crashing.
                if let streamData = CGPDFStreamCopyData(objectStream, &fmt) {
                    let dataLength: Int = CFDataGetLength(streamData)
                    print("data stream (length=\(dataLength)):")
                    // Only short streams are dumped as text.
                    if dataLength < 400 {
                        let data = NSData(data: streamData)
                        if let dataString = NSString(data: data, encoding: NSUTF8StringEncoding) {
                            print(dataString)
                        } else {
                            print("(stream data is not valid UTF-8)")
                        }
                    }
                }
            }
        default:
            print("Null")
        }
    }
    
    /// Converts a PDF array into an Objective-C array.
    ///
    /// Each element is converted with `objectForPDFObject`; elements for which that
    /// call returns nil are left out, so the result can be shorter than the PDF array.
    ///
    /// - Parameter pdfArray: The PDF array to convert.
    /// - Returns: A new mutable array with the converted elements, in their original order.
    func arrayFromPDFArray(pdfArray: CGPDFArrayRef ) -> NSMutableArray {
        let tmpArray = NSMutableArray()

        let count = CGPDFArrayGetCount(pdfArray)
        for i in 0..<count {
            var value = CGPDFObjectRef()
            if CGPDFArrayGetObject(pdfArray, i, &value) {
                if let object = objectForPDFObject(value) {
                    tmpArray.addObject(object)
                }
            }
        }

        return tmpArray
    }
    
    /// Converts a single PDF object into a Foundation value.
    ///
    /// - Parameter object: The PDF object to convert.
    /// - Returns: An `NSNumber` for a boolean, the numeric value for an integer or
    ///   real, the text of a name or string, a converted array for a nested PDF
    ///   array, or the stream data decoded as UTF-8 text for a stream. Returns nil
    ///   when the value cannot be read, for any other object type, and for
    ///   dictionaries (whose entries are printed via `printPDFKeys` instead).
    func objectForPDFObject( object: CGPDFObjectRef) -> AnyObject? {
        let objectType: CGPDFObjectType = CGPDFObjectGetType(object)
        switch (objectType) {
        case .Boolean:
            var objectBoolean = CGPDFBoolean()
            if CGPDFObjectGetValue(object, objectType, &objectBoolean) {
                let testbool = NSNumber(unsignedChar: objectBoolean)
                return testbool
            }
        case .Integer:
            var objectInteger = CGPDFInteger()
            if CGPDFObjectGetValue(object, objectType, &objectInteger) {
                return objectInteger
            }
        case .Real:
            var objectReal = CGPDFReal()
            if CGPDFObjectGetValue(object, objectType, &objectReal) {
                return objectReal
            }
        case .Name:
            // Names are handled here the same way printPDFKeys decodes them,
            // so they are no longer dropped from converted arrays.
            var namePointer = UnsafePointer<Int8>()
            if CGPDFObjectGetValue(object, objectType, &namePointer) {
                if let stringName = String(CString: UnsafePointer<CChar>(namePointer), encoding: NSISOLatin1StringEncoding) {
                    return stringName as NSString
                }
            }
        case .String:
            // Only convert the string when the lookup succeeded.
            var objectString = CGPDFStringRef()
            if CGPDFObjectGetValue(object, objectType, &objectString) {
                return CGPDFStringCopyTextString(objectString)
            }
        case .Array:
            // Nested arrays are converted recursively instead of being dropped.
            var objectArray = CGPDFArrayRef()
            if CGPDFObjectGetValue(object, objectType, &objectArray) {
                return arrayFromPDFArray(objectArray)
            }
        case .Dictionary:
            // Dictionaries are not converted; their entries are printed and the
            // function falls through to return nil.
            var objectDictionary = CGPDFDictionaryRef()
            if CGPDFObjectGetValue(object, objectType, &objectDictionary) {
                let count = CGPDFDictionaryGetCount(objectDictionary)
                print("In array, found dictionary with \(count) entries")
                CGPDFDictionaryApplyFunction(objectDictionary, printPDFKeys, nil)
            }
        case .Stream:
            var objectStream = CGPDFStreamRef()
            if CGPDFObjectGetValue(object, objectType, &objectStream) {
                var fmt: CGPDFDataFormat = .Raw
                // The stream data may be unavailable; return nil instead of crashing.
                if let streamData = CGPDFStreamCopyData(objectStream, &fmt) {
                    let data = NSData(data: streamData)
                    print("data stream (length=\(CFDataGetLength(streamData))):")
                    return NSString(data: data, encoding: NSUTF8StringEncoding)
                }
            }
        default:
            return nil
        }
        return nil
    }