ios, swift, avfoundation, ios-vision

TrueDepth Camera Pixel Distance Inaccuracies


I'm leveraging the front-facing TrueDepth camera, in combination with Vision, to recognize points in the image and run some measurements. I understand that Vision coordinates are normalized, so I'm converting the normalized Vision points to CGPoints corresponding to the view, then matching those to the depth data in dataOutputSynchronizer to get the z value. Then, using the camera intrinsics, I'm attempting to get the distance between 2 points in 3D space.
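For context, the last step (turning two depth samples plus the intrinsics into a metric distance) is just the standard pinhole back-projection. Here is a minimal sketch of what I mean; the helper name is only for illustration, and it assumes the pixel coordinates are expressed in the pixel grid the intrinsic matrix refers to (cameraCalibrationData.intrinsicMatrixReferenceDimensions) and that the DepthFloat32 values are in meters:

import simd
import CoreGraphics

// Illustrative helper: back-project one depth sample into camera space using
// the pinhole model. Two such points can then be compared with simd_distance.
func cameraSpacePoint(pixel: CGPoint, depth: Float, intrinsics: simd_float3x3) -> simd_float3 {
    let fx = intrinsics.columns.0.x   // focal length x
    let fy = intrinsics.columns.1.y   // focal length y
    let cx = intrinsics.columns.2.x   // principal point x
    let cy = intrinsics.columns.2.y   // principal point y
    let x = (Float(pixel.x) - cx) / fx * depth
    let y = (Float(pixel.y) - cy) / fy * depth
    return simd_float3(x, y, depth)
}

// e.g. let distanceInMeters = simd_distance(point1, point2)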

I have successfully found the points and (I believe) converted them to screen points. My thinking here is that these CGPoints would be no different than if I tapped them on the screen.

My issue is that even though the converted CGPoints remain mostly similar (my hand does move around a little during testing but stays mostly planar to the camera), and I'm calculating the depth position the same way each time, the depths can be wildly different, especially point 2. Depth point 2 seems more accurate in terms of calculated distance (my hand is about 1 foot from the camera), but it varies a lot and still isn't accurate.

Here is a console print with the relevant data:

there are 2 points found
recognized points
[(499.08930909633636, 634.0807711283367), (543.7462849617004, 1061.8824380238852)]
DEPTH POINT 1 =  3.6312041
DEPTH POINT 2 =  0.2998223

there are 2 points found
recognized points
[(498.33644700050354, 681.3769372304281), (602.3667773008347, 1130.4955183664956)]
DEPTH POINT 1 =  3.6276162
DEPTH POINT 2 =  0.560331

Here is some of the relevant code.

dataOutputSynchronizer

func dataOutputSynchronizer(_ synchronizer: AVCaptureDataOutputSynchronizer,
                                didOutput synchronizedDataCollection: AVCaptureSynchronizedDataCollection) {
        
        var handPoints: [CGPoint] = []
        
        // Read all outputs
        guard renderingEnabled,
            let syncedDepthData: AVCaptureSynchronizedDepthData =
            synchronizedDataCollection.synchronizedData(for: depthDataOutput) as? AVCaptureSynchronizedDepthData,
            let syncedVideoData: AVCaptureSynchronizedSampleBufferData =
            synchronizedDataCollection.synchronizedData(for: videoDataOutput) as? AVCaptureSynchronizedSampleBufferData else {
                // only work on synced pairs
                return
        }
        
        if syncedDepthData.depthDataWasDropped || syncedVideoData.sampleBufferWasDropped {
            return
        }
        
        let depthPixelBuffer = syncedDepthData.depthData.depthDataMap
        guard let videoPixelBuffer = CMSampleBufferGetImageBuffer(syncedVideoData.sampleBuffer) else {
            return
        }
        
        // Get the cameraIntrinsics
        guard let cameraIntrinsics = syncedDepthData.depthData.cameraCalibrationData?.intrinsicMatrix else {
            return
        }
        
        let image = CIImage(cvPixelBuffer: videoPixelBuffer)
        
        let handler = VNImageRequestHandler(
           cmSampleBuffer: syncedVideoData.sampleBuffer,
           orientation: .up,
           options: [:]
         )
        
         do {
           try handler.perform([handPoseRequest])
           guard
             let results = handPoseRequest.results?.prefix(2),
             !results.isEmpty
           else {
             return
           }

            var recognizedPoints: [VNRecognizedPoint] = []

             try results.forEach { observation in
               let fingers = try observation.recognizedPoints(.all)

               if let middleTipPoint = fingers[.middleDIP] {
                 recognizedPoints.append(middleTipPoint)
               }

               if let wristPoint = fingers[.wrist] {
                 recognizedPoints.append(wristPoint)
               }
             }

             // Store the Points in handPoints if they are confident points
             handPoints = recognizedPoints.filter {
               $0.confidence > 0.90
             }
             .map {
               // Vision points use a bottom-left origin; flip Y so the point matches the top-left origin used by AVFoundation
               CGPoint(x: $0.location.x, y: 1 - $0.location.y)
             }
             
             // Process the Points Found
             DispatchQueue.main.sync {
               self.processPoints(handPoints, depthPixelBuffer, videoPixelBuffer, cameraIntrinsics)
             }
         } catch {
             // Be more graceful here 
         }
    }

Process Points

func processPoints(_ handPoints: [CGPoint], _ depthPixelBuffer: CVImageBuffer, _ videoPixelBuffer: CVImageBuffer, _ cameraIntrinsics: simd_float3x3) {

        // This converts the normalized point to screen points
        // cameraView.previewLayer is a AVCaptureVideoPreviewLayer inside a UIView
        let convertedPoints = handPoints.map {
            cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
        }
       
        // We need 2 hand points to get the distance 
        if handPoints.count == 2 {
            print("there are 2 points found");
            print("recognized points")
            print(convertedPoints)
            
            let handVisionPoint1 = convertedPoints[0]
        
            let handVisionPoint2 = convertedPoints[1]
            
            let scaleFactor = CGFloat(CVPixelBufferGetWidth(depthPixelBuffer)) / CGFloat(CVPixelBufferGetWidth(videoPixelBuffer))
            
            CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
            let floatBuffer = unsafeBitCast(CVPixelBufferGetBaseAddress(depthPixelBuffer), to: UnsafeMutablePointer<Float32>.self)
            
            let width = CVPixelBufferGetWidth(depthPixelBuffer)
            let height = CVPixelBufferGetHeight(depthPixelBuffer)
            
            let handVisionPixelX = Int((handVisionPoint1.x * scaleFactor).rounded())
            let handVisionPixelY = Int((handVisionPoint1.y * scaleFactor).rounded())
            
            let handVisionPixe2X = Int((handVisionPoint2.x * scaleFactor).rounded())
            let handVisionPixe2Y = Int((handVisionPoint2.y * scaleFactor).rounded())
            
            let rowDataPoint1 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixelY * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
            let handVisionPoint1Depth = rowDataPoint1.assumingMemoryBound(to: Float32.self)[handVisionPixelX]
            
            print("DEPTH POINT 1 = ", handVisionPoint1Depth)
            
            let rowDataPoint2 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixe2Y * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
            let handVisionPoint2Depth = rowDataPoint2.assumingMemoryBound(to: Float32.self)[handVisionPixe2X]
            
            print("DEPTH POINT 2 = ", handVisionPoint2Depth)
            //Int((width - touchPoint.x) * (height - touchPoint.y))
            
            CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly)
        }
}

Right now I'm thinking my logic for finding the correct pixel in the depth map is incorrect. If that is not the case, then I'm wondering if the data stream is out of sync. But honestly, I'm just a little lost at the moment. Thanks for any assistance!
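For what it's worth, here is the quick sync check I have in mind: logging the timestamp gap of each synchronized pair right after the guard in dataOutputSynchronizer (a sketch using the timestamp property on the synchronized data; if the pairs really are in sync, the gap should be tiny):

// Inside dataOutputSynchronizer(_:didOutput:), after the guard above
let depthTime = syncedDepthData.timestamp
let videoTime = syncedVideoData.timestamp
let deltaMs = CMTimeGetSeconds(CMTimeSubtract(videoTime, depthTime)) * 1000.0
print("depth/video timestamp delta: \(deltaMs) ms")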


Solution

  • The answer ended up being fairly straightforward. I have Reality-Dev's post on the Apple Developer forums (and his Body Tracking repo) to thank for showing the way.

    In this line, I was converting the normalized Vision points to screen points:

     let convertedPoints = handPoints.map {
                cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
            }
    

    This was the main problem: I needed the normalized points, not the converted screen points. A very rookie mistake. This code provides accurate depth-map distances at the Vision points:

    if handPoints.count == 2 {
        
        // Use the normalized Vision points directly (no conversion to screen/layer points)
        let handVisionPoint1 = handPoints[0]
        let handVisionPoint2 = handPoints[1]
        
        // Only proceed for 32-bit float depth maps
        guard CVPixelBufferGetPixelFormatType(depthPixelBuffer) == kCVPixelFormatType_DepthFloat32 else { return }
        
        let width = CVPixelBufferGetWidth(depthPixelBuffer)
        let height = CVPixelBufferGetHeight(depthPixelBuffer)
        
        // Map the normalized points to column/row positions in the depth map
        let colPosition1 = Int(handVisionPoint1.x * CGFloat(width))
        let rowPosition1 = Int(handVisionPoint1.y * CGFloat(height))
        
        let colPosition2 = Int(handVisionPoint2.x * CGFloat(width))
        let rowPosition2 = Int(handVisionPoint2.y * CGFloat(height))
        
        CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
        
        if let baseAddress = CVPixelBufferGetBaseAddress(depthPixelBuffer) {
            
            // Note: this indexing assumes the depth map has no row padding
            // (i.e. bytesPerRow == width * MemoryLayout<Float>.stride)
            let index1 = colPosition1 + (rowPosition1 * width)
            let index2 = colPosition2 + (rowPosition2 * width)
            
            let offset1 = index1 * MemoryLayout<Float>.stride
            let offset2 = index2 * MemoryLayout<Float>.stride
            
            let distanceValue1 = baseAddress.load(fromByteOffset: offset1, as: Float.self)
            let distanceValue2 = baseAddress.load(fromByteOffset: offset2, as: Float.self)
            
            print("DEPTH POINT 1 = ", distanceValue1)
            print("DEPTH POINT 2 = ", distanceValue2)
        }
        
        CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly)
    }
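
    From here, the 3D distance I was originally after comes from combining these depth values with the camera intrinsics, along the lines of the back-projection sketch near the top of the question. A rough sketch only (it assumes the calibration's intrinsicMatrixReferenceDimensions gets passed into processPoints along with the intrinsics, and that it is called with handVisionPoint1/handVisionPoint2 and distanceValue1/distanceValue2):

    // Sketch only: scale the normalized Vision points to the intrinsics'
    // reference pixel grid (not the depth-map size) before unprojecting.
    func distanceBetween(point1: CGPoint, depth1: Float,
                         point2: CGPoint, depth2: Float,
                         intrinsics: simd_float3x3,
                         referenceDimensions: CGSize) -> Float {
        let pixel1 = CGPoint(x: point1.x * referenceDimensions.width,
                             y: point1.y * referenceDimensions.height)
        let pixel2 = CGPoint(x: point2.x * referenceDimensions.width,
                             y: point2.y * referenceDimensions.height)
        // cameraSpacePoint(pixel:depth:intrinsics:) is the illustrative helper
        // sketched earlier in the question.
        let p1 = cameraSpacePoint(pixel: pixel1, depth: depth1, intrinsics: intrinsics)
        let p2 = cameraSpacePoint(pixel: pixel2, depth: depth2, intrinsics: intrinsics)
        return simd_distance(p1, p2)
    }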