ios swift firebase firebase-mlkit text-recognition

Swift, Firebase - Use CMSampleBufferRef with live feed of camera


I'm currently trying to implement ML Kit from Firebase to use text recognition.

So far, I've got the code for the camera, which shows its live feed inside a UIView. I now want to recognize text in this live feed, which I reckon is possible with the help of a CMSampleBufferRef (let image = VisionImage(buffer: bufferRef) - see the linked Firebase tutorial, Step 2).
How can I create such a CMSampleBufferRef and make it hold the live feed of the camera (the UIView)?

My code for the camera:

@IBOutlet weak var cameraView: UIView!
    var session: AVCaptureSession?
    var device: AVCaptureDevice?
    var input: AVCaptureDeviceInput?
    var output: AVCaptureMetadataOutput?
    var prevLayer: AVCaptureVideoPreviewLayer?

    override func viewDidLoad() {
        super.viewDidLoad()
        prevLayer?.frame.size = cameraView.frame.size
    }

    func createSession() {
        session = AVCaptureSession()
        device = AVCaptureDevice.default(for: AVMediaType.video)

        do{
            input = try AVCaptureDeviceInput(device: device!)
        }
        catch{
            print(error)
        }

        if let input = input{
            session?.addInput(input)
        }

        prevLayer = AVCaptureVideoPreviewLayer(session: session!)
        prevLayer?.frame.size = cameraView.frame.size
        prevLayer?.videoGravity = AVLayerVideoGravity.resizeAspectFill

        prevLayer?.connection?.videoOrientation = transformOrientation(orientation: UIInterfaceOrientation(rawValue: UIApplication.shared.statusBarOrientation.rawValue)!)

        cameraView.layer.addSublayer(prevLayer!)

        session?.startRunning()
    }

    func cameraWithPosition(position: AVCaptureDevice.Position) -> AVCaptureDevice? {
        let deviceDiscoverySession = AVCaptureDevice.DiscoverySession(deviceTypes: [.builtInDualCamera, .builtInTelephotoCamera, .builtInTrueDepthCamera, .builtInWideAngleCamera, ], mediaType: .video, position: position)

        if let device = deviceDiscoverySession.devices.first {
            return device
        }
        return nil
    }

    override func viewWillTransition(to size: CGSize, with coordinator: UIViewControllerTransitionCoordinator) {
        coordinator.animate(alongsideTransition: { (context) -> Void in
            self.prevLayer?.connection?.videoOrientation = self.transformOrientation(orientation: UIInterfaceOrientation(rawValue: UIApplication.shared.statusBarOrientation.rawValue)!)
            self.prevLayer?.frame.size = self.cameraView.frame.size
        }, completion: { (context) -> Void in

        })
        super.viewWillTransition(to: size, with: coordinator)
    }

    func transformOrientation(orientation: UIInterfaceOrientation) -> AVCaptureVideoOrientation {
        switch orientation {
        case .landscapeLeft:
            return .landscapeLeft
        case .landscapeRight:
            return .landscapeRight
        case .portraitUpsideDown:
            return .portraitUpsideDown
        default:
            return .portrait
        }
    }

Solution

  • Edit: I have added a working Swift sample to match the language of your question:

    import UIKit
    import AVFoundation
    
    class ViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {
        @IBOutlet weak var cameraView: UIView!
        var session: AVCaptureSession!
        var device: AVCaptureDevice?
        var input: AVCaptureDeviceInput?
        var videoOutput: AVCaptureVideoDataOutput!
        var output: AVCaptureMetadataOutput?
        var prevLayer: AVCaptureVideoPreviewLayer!
        
        override func viewDidLoad() {
            super.viewDidLoad()
            
            session = AVCaptureSession()
            device = AVCaptureDevice.default(for: AVMediaType.video)
            
            do{
                input = try AVCaptureDeviceInput(device: device!)
            }
            catch{
                print(error)
                return
            }
            
            if let input = input {
                if session.canAddInput(input) {
                    session.addInput(input)
                }
            }
            
            videoOutput = AVCaptureVideoDataOutput()
            videoOutput.videoSettings = [
                String(kCVPixelBufferPixelFormatTypeKey): NSNumber(value: kCVPixelFormatType_32BGRA)
            ]
            videoOutput.alwaysDiscardsLateVideoFrames = true
            
            let queue = DispatchQueue(label: "video-frame-sampler")
            videoOutput.setSampleBufferDelegate(self, queue: queue)
            if session.canAddOutput(videoOutput) {
                session.addOutput(videoOutput)
                
                if let connection = videoOutput.connection(with: .video) {
                    connection.videoOrientation = videoOrientationFromInterfaceOrientation()
                    
                    if connection.isVideoStabilizationSupported {
                        connection.preferredVideoStabilizationMode = .auto
                    }
                }
            }
            
            prevLayer = AVCaptureVideoPreviewLayer(session: session)
            prevLayer.frame.size = cameraView.frame.size
            prevLayer.videoGravity = AVLayerVideoGravity.resizeAspectFill
            cameraView.layer.addSublayer(prevLayer!)
            
            session.startRunning()
        }
        
        func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
            // Pass your sampleBuffer to the Vision API here.
            // I recommend not passing every frame, however; skip some frames until the camera is steady and focused (see the sketch after this sample).
            print("frame received")
        }
        
        func videoOrientationFromInterfaceOrientation() -> AVCaptureVideoOrientation {
            // Map by name: a raw-value cast would swap the landscape cases and
            // crash for .unknown, because the two enums use different raw values.
            switch UIApplication.shared.statusBarOrientation {
            case .landscapeLeft:
                return .landscapeLeft
            case .landscapeRight:
                return .landscapeRight
            case .portraitUpsideDown:
                return .portraitUpsideDown
            default:
                return .portrait
            }
        }
    }
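
    The captureOutput callback above is where you hand each frame to ML Kit. Below is a minimal sketch of that hand-off, assuming the on-device text recognizer from the Firebase ML Kit tutorial you linked (FirebaseMLVision pod); the module name, the every-10th-frame throttle, and the orientation value are assumptions to adapt to your setup. The properties and the method belong inside the ViewController from the sample:

        import FirebaseMLVision // or `import Firebase`, depending on your pod setup

        // Properties to add to the ViewController above (hypothetical names).
        lazy var textRecognizer = Vision.vision().onDeviceTextRecognizer()
        var frameCounter = 0

        func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
            // Don't feed the recognizer every frame; every 10th frame is an arbitrary choice.
            frameCounter += 1
            guard frameCounter % 10 == 0 else { return }

            let visionImage = VisionImage(buffer: sampleBuffer)

            // The connection's videoOrientation was set in viewDidLoad, so the buffer
            // arrives upright; otherwise derive the orientation from the device orientation.
            let metadata = VisionImageMetadata()
            metadata.orientation = .topLeft
            visionImage.metadata = metadata

            textRecognizer.process(visionImage) { result, error in
                guard error == nil, let result = result else { return }
                print(result.text)
            }
        }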
    

    I see that you have already set up your input and preview layer, but you also need to set up a video capture output to grab your CMSampleBufferRef frames.

    To do this, set up an object of type AVCaptureVideoDataOutput with the following steps:

    1. Create an instance of AVCaptureVideoDataOutput and configure it:

       AVCaptureVideoDataOutput* videoOutput = [AVCaptureVideoDataOutput new]; // no autorelease under ARC
       videoOutput.videoSettings = @{(id)kCVPixelBufferPixelFormatTypeKey:@(kCVPixelFormatType_32BGRA)};
       videoOutput.alwaysDiscardsLateVideoFrames = YES;
      
    2. Set up the frame capture (sample buffer) delegate of the configured output and add the output to the session:

       dispatch_queue_t queue = dispatch_queue_create("video-frame-sampler", 0);
       [videoOutput setSampleBufferDelegate:self queue:queue];
       if ([self.session canAddOutput:videoOutput]) {
           [self.session addOutput:videoOutput];
      
           AVCaptureConnection* connection = [videoOutput connectionWithMediaType:AVMediaTypeVideo];
           connection.videoOrientation = [self videoOrientationFromDeviceOrientation];
           if (connection.supportsVideoStabilization) {
               connection.preferredVideoStabilizationMode = AVCaptureVideoStabilizationModeAuto;
           }
       }
      
    3. Implement the captureOutput:didOutputSampleBuffer:fromConnection: method, which is where you will get your required CMSampleBufferRef:

       -(void)captureOutput:(AVCaptureOutput *)captureOutput didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer fromConnection:(AVCaptureConnection *)connection {
           // Pass your sampleBuffer to the Vision API here.
           // I recommend not passing every frame, however; skip some frames until the camera is steady and focused.
       }
      

    I'm a plain old Objective-C developer, but you can easily convert the code to Swift as needed.

    Additionally, here is the code for the videoOrientationFromDeviceOrientation method:

    -(AVCaptureVideoOrientation)videoOrientationFromDeviceOrientation {
        UIDeviceOrientation orientation = [UIDevice currentDevice].orientation;
        AVCaptureVideoOrientation result = (AVCaptureVideoOrientation)orientation;
        if ( orientation == UIDeviceOrientationLandscapeLeft )
            result = AVCaptureVideoOrientationLandscapeRight;
        else if ( orientation == UIDeviceOrientationLandscapeRight )
            result = AVCaptureVideoOrientationLandscapeLeft;
        else if ( !UIDeviceOrientationIsValidInterfaceOrientation(orientation) )
            result = AVCaptureVideoOrientationPortrait; // face up, face down and unknown have no video equivalent
        return result;
    }
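
    If you want that last method in Swift as well, a direct translation might look like this (a sketch; face up, face down and unknown orientations fall back to portrait):

        func videoOrientationFromDeviceOrientation() -> AVCaptureVideoOrientation {
            switch UIDevice.current.orientation {
            case .landscapeLeft:            // home button on the right
                return .landscapeRight
            case .landscapeRight:           // home button on the left
                return .landscapeLeft
            case .portraitUpsideDown:
                return .portraitUpsideDown
            default:                        // .portrait, .faceUp, .faceDown, .unknown
                return .portrait
            }
        }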