I'm trying to use Swift to parse the contents of PDF documents, following Apple's programming guide (in which all the examples are ObjC...)
let filepath = "/Users/ben/Desktop/Test.pdf"
let localUrl = filepath as CFString
if let pdfURL = CFURLCreateWithFileSystemPath(nil, localUrl, CFURLPathStyle.cfurlposixPathStyle, false) {
if let pdf = CGPDFDocument(pdfURL) {
if let inf = pdf.info {
CGPDFDictionaryApplyFunction(inf, { (key, object, info) -> Void in
print("\(key), \(object), \(info)")
}, nil)
}
if let cat = pdf.catalog {
CGPDFDictionaryApplyFunction(cat, { (key, object, info) -> Void in
print("\(key), \(object), \(info)")
}, nil)
}
}
}
While this seems to produce some results, it's just strings of hex digits.
0x00007ff29f43ce00, 0x00007ff29f492bd0, nil
0x00007ff29f443b60, 0x00007ff29f492cd0, nil
0x00007ff29f482590, 0x00007ff29f492dd0, nil
0x00007ff29f482a40, 0x00007ff29f492ed0, nil
0x00007ff29f482e30, 0x00007ff29f492fe0, nil
0x00007ff29f47da20, 0x00007ff29f4930e0, nil
0x00007ff29f474ac0, 0x00007ff29f842b50, nil
0x00007ff29f43f5d0, 0x00007ff29f842bf0, nil
0x00007ff29f485eb0, 0x00007ff29f842a60, nil
0x00007ff29f482f70, 0x00007ff29f842ab0, nil
0x00007ff29f48b1c0, 0x00007ff29f48f6d0, nil
So how do I get the actual data? Ideally, I'm trying to get at the document metadata and things like fonts contained.
Your parsing retrieving high level dictionary and info data is correct, but you need to expand the decoding in CGPDFDictionaryApplyFunction to display the values of PDF data according their types (integer, string, array, dictionary, and so on). The syntax of the CGPDFDictionaryApplierFunction you are calling is:
typealias CGPDFDictionaryApplierFunction = (UnsafePointer<Int8>, COpaquePointer, UnsafeMutablePointer<()>) -> Void
Your program is displaying the pointers to the data, you could access the data values according their types as below (Swift 2):
let filepath = "/Users/ben/Desktop/Test.pdf"
let urlDocument = NSURL(fileURLWithPath: filepath)
let myDocument = CGPDFDocumentCreateWithURL(urlDocument)
if myDocument != nil {
let numPages = CGPDFDocumentGetNumberOfPages(myDocument)
print("Number of pages: \(numPages)")
// Get complete catalog
let myCatalog = CGPDFDocumentGetCatalog(myDocument)
CGPDFDictionaryApplyFunction(myCatalog, printPDFKeys, nil)
let myInfo = CGPDFDocumentGetInfo(myDocument)
CGPDFDictionaryApplyFunction(myInfo, printPDFKeys, nil)
} else {
print("Cannot open PDF document")
}
In order to be called from the CGPDFDictionaryApplyFunction, the printPDFKeys is to be called as a global function (outside your main class), alternately you could insert the code in a closure of CGPDFDictionaryApplyFunction as in your example above. The below code is shortened and is not including complete protection against errors and null values.
func printPDFKeys( key: UnsafePointer<Int8>, object: COpaquePointer, info: UnsafeMutablePointer<()>) {
let contentDict: CGPDFDictionaryRef = CGPDFDictionaryRef(info)
let keyString = String(CString: UnsafePointer<CChar>(key), encoding: NSISOLatin1StringEncoding)
let objectType = CGPDFObjectGetType(object)
if keyString == nil {
return
}
print("key \(keyString!) is present in dictionary, type \(objectType.rawValue)")
var ptrObjectValue = UnsafePointer<Int8>()
switch objectType {
// ObjectType is enum of:
// Null
// Boolean
// Integer
// Real
// Name
// String
// Array
// Dictionary
// Stream
case .Boolean:
// Boolean
var objectBoolean = CGPDFBoolean()
if CGPDFObjectGetValue(object, objectType, &objectBoolean) {
let testbool = NSNumber(unsignedChar: objectBoolean)
print("Boolean value \(testbool)")
}
case .Integer:
// Integer
var objectInteger = CGPDFInteger()
if CGPDFObjectGetValue(object, objectType, &objectInteger) {
print("Integer value \(objectInteger)")
}
case .Real:
// Real
var objectReal = CGPDFReal()
if CGPDFObjectGetValue(object, objectType, &objectReal) {
print("Real value \(objectReal)")
}
case .Name:
// Name
if (CGPDFObjectGetValue(object, objectType, &ptrObjectValue)) {
let stringName = String(CString: UnsafePointer<CChar>(ptrObjectValue), encoding: NSISOLatin1StringEncoding)
print("Name value: \(stringName!)")
}
case .String:
// String
let valueFound = CGPDFObjectGetValue(object, objectType, &ptrObjectValue)
let stringValue = CGPDFStringCopyTextString(COpaquePointer(ptrObjectValue))
print("String value: \(stringValue!)")
case .Array:
// Array
print("Array")
var objectArray = CGPDFArrayRef()
if (CGPDFObjectGetValue(object, objectType, &objectArray))
{
print("array: \(arrayFromPDFArray(objectArray))")
}
case .Dictionary:
// Dictionary
var objectDictionary = CGPDFDictionaryRef()
if (CGPDFObjectGetValue(object, objectType, &objectDictionary)) {
let count = CGPDFDictionaryGetCount(objectDictionary)
print("Found dictionary with \(count) entries")
if !(keyString == "Parent") && !(keyString == "P") {
//catalogLevel = catalogLevel + 1
CGPDFDictionaryApplyFunction(objectDictionary, printPDFKeys, nil)
//catalogLevel = catalogLevel - 1
}
}
case .Stream:
// Stream
print("Stream")
var objectStream = CGPDFStreamRef()
if (CGPDFObjectGetValue(object, objectType, &objectStream)) {
let dict: CGPDFDictionaryRef = CGPDFStreamGetDictionary( objectStream )
var fmt: CGPDFDataFormat = .Raw
let streamData: CFDataRef = CGPDFStreamCopyData(objectStream, &fmt)!;
let data = NSData(data: streamData)
let dataString = NSString(data: data, encoding: NSUTF8StringEncoding)
let dataLength: Int = CFDataGetLength(streamData)
print("data stream (length=\(dataLength)):")
if dataLength < 400 {
print(dataString)
}
}
default:
print("Null")
}
}
// convert a PDF array into an objC one
func arrayFromPDFArray(pdfArray: CGPDFArrayRef ) -> NSMutableArray {
var i:Int = 0
var tmpArray: NSMutableArray = NSMutableArray()
let count = CGPDFArrayGetCount(pdfArray)
for i in 0..<count {
var value = CGPDFObjectRef()
if (CGPDFArrayGetObject(pdfArray, i, &value)) {
if let object = objectForPDFObject(value) {
tmpArray.addObject(object)
}
}
}
return tmpArray
}
func objectForPDFObject( object: CGPDFObjectRef) -> AnyObject? {
let objectType: CGPDFObjectType = CGPDFObjectGetType(object)
var ptrObjectValue = UnsafePointer<Int8>()
switch (objectType) {
case .Boolean:
// Boolean
var objectBoolean = CGPDFBoolean()
if CGPDFObjectGetValue(object, objectType, &objectBoolean) {
let testbool = NSNumber(unsignedChar: objectBoolean)
return testbool
}
case .Integer:
// Integer
var objectInteger = CGPDFInteger()
if CGPDFObjectGetValue(object, objectType, &objectInteger) {
return objectInteger
}
case .Real:
// Real
var objectReal = CGPDFReal()
if CGPDFObjectGetValue(object, objectType, &objectReal) {
return objectReal
}
case .String:
let valueFound = CGPDFObjectGetValue(object, objectType, &ptrObjectValue)
let stringValue = CGPDFStringCopyTextString(COpaquePointer(ptrObjectValue))
return stringValue
case .Dictionary:
// Dictionary
var objectDictionary = CGPDFDictionaryRef()
if (CGPDFObjectGetValue(object, objectType, &objectDictionary)) {
let count = CGPDFDictionaryGetCount(objectDictionary)
print("In array, found dictionary with \(count) entries")
CGPDFDictionaryApplyFunction(objectDictionary, printPDFKeys, nil)
}
case .Stream:
// Stream
var objectStream = CGPDFStreamRef()
if (CGPDFObjectGetValue(object, objectType, &objectStream)) {
let dict: CGPDFDictionaryRef = CGPDFStreamGetDictionary( objectStream )
var fmt: CGPDFDataFormat = .Raw
let streamData: CFDataRef = CGPDFStreamCopyData(objectStream, &fmt)!;
let data = NSData(data: streamData)
let dataString = NSString(data: data, encoding: NSUTF8StringEncoding)
print("data stream (length=\(CFDataGetLength(streamData))):")
return dataString
}
default:
return nil
}
return nil
}