
Real-time AVAssetWriter synchronise audio and video when pausing/resuming

I am trying to record a video with sound using the iPhone's front camera. Because I also need to support pause/resume functionality, I need to use AVAssetWriter. I found an example online, written in Objective-C, which almost achieves the desired functionality.

Unfortunately, after converting this example to Swift, I noticed that if I pause and resume, there is a small but noticeable period at the end of each "section" during which the video shows a still frame while the audio keeps playing. So it seems that when isPaused is set, the recorded audio track ends up longer than the recorded video track.

Sorry if it may seem like a noob question, but I am not a great expert in AVFoundation and some help would be appreciated!

Below I post my implementation of didOutput sampleBuffer.

/// Capture callback: receives each sample buffer, shifts its timestamps by the
/// accumulated offset `cmOffset`, and passes the result to `encoder`.
///
/// NOTE(review): this listing appears to have lost lines (closing braces, and
/// probably some `return` / loop-increment statements). The comments below
/// describe only what is visible and flag anything that looks missing.
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    // A buffer counts as video unless it arrived on a connection other than `videoConntection`.
    var isVideo = true
    if videoConntection != connection {
        isVideo = false
    // Guard for "not capturing, or paused". The guarded body is not visible here —
    // presumably an early return; confirm against the full source.
    if (!isCapturing || isPaused) {

    // Create the encoder on first use. Channel count and sample rate are read from
    // the buffer's format description as an audio stream description.
    // NOTE(review): the `if isVideo` branch presumably bails out so that setup
    // happens on an audio buffer — its body is not visible; confirm.
    if (encoder == nil) {
        if isVideo {
        if let fmt = CMSampleBufferGetFormatDescription(sampleBuffer) {
            let desc = CMAudioFormatDescriptionGetStreamBasicDescription(fmt as CMAudioFormatDescription)
            if let chan = desc?.pointee.mChannelsPerFrame, let rate = desc?.pointee.mSampleRate {
                // NOTE(review): force-unwrap — crashes if tempPath() returns nil.
                let path = tempPath()!
                encoder = VideoEncoder(path: path, height: Int(cameraSize.height), width: Int(cameraSize.width), channels: chan, rate: rate)
    // `discont` is set by the pause code; when it is set, recompute how much time
    // must be removed from subsequent timestamps.
    // NOTE(review): the body of the `if isVideo` branch is not visible here — confirm.
    if discont {
        if isVideo {
        discont = false
        var pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
        let last = lastAudio
        if last.flags.contains(CMTimeFlags.valid) {
            // Apply the offset accumulated so far, then measure the gap between this
            // buffer and `lastAudio` (the stored end time of the last audio buffer).
            if cmOffset.flags.contains(CMTimeFlags.valid) {
                pts = CMTimeSubtract(pts, cmOffset)
            let off = CMTimeSubtract(pts, last)
            print("setting offset from \(isVideo ? "video":"audio")")
            // NOTE(review): the "pts" field below prints cmOffset a second time;
            // `pts` was probably intended.
            print("adding \(CMTimeGetSeconds(off)) to \(CMTimeGetSeconds(cmOffset)) (pts \(CMTimeGetSeconds(cmOffset)))")
            // With no offset recorded yet (value == 0) adopt the gap as the offset;
            // otherwise add it to the running offset.
            if cmOffset.value == 0 {
                cmOffset = off
            else {
                cmOffset = CMTimeAdd(cmOffset, off)
        // Clear the flags so the stored timestamps no longer pass the `.valid` check above.
        lastVideo.flags = []
        lastAudio.flags = []
    // Choose the buffer to write: a re-timed copy when an offset applies,
    // otherwise the original buffer.
    var out:CMSampleBuffer?
    if cmOffset.value > 0 {
        var count:CMItemCount = CMSampleBufferGetNumSamples(sampleBuffer)
        // NOTE(review): no matching deallocate() for `pInfo` is visible in this
        // listing — a possible per-buffer leak; confirm.
        let pInfo = UnsafeMutablePointer<CMSampleTimingInfo>.allocate(capacity: count)
        CMSampleBufferGetSampleTimingInfoArray(sampleBuffer, entryCount: count, arrayToFill: pInfo, entriesNeededOut: &count)
        // Subtract the offset from every timing entry's decode and presentation time.
        // NOTE(review): no increment of `i` is visible; as shown the loop would not
        // terminate when count > 0 — likely a line lost from the listing.
        var i = 0
        while i<count {
            pInfo[i].decodeTimeStamp = CMTimeSubtract(pInfo[i].decodeTimeStamp, cmOffset)
            pInfo[i].presentationTimeStamp = CMTimeSubtract(pInfo[i].presentationTimeStamp, cmOffset)
        CMSampleBufferCreateCopyWithNewTiming(allocator: nil, sampleBuffer: sampleBuffer, sampleTimingEntryCount: count, sampleTimingArray: pInfo, sampleBufferOut: &out)
    else {
        out = sampleBuffer
    // Record where this buffer ends (PTS, plus duration when the duration is positive)
    // in `lastVideo` or `lastAudio`; `lastAudio` feeds the gap calculation above.
    var pts = CMSampleBufferGetPresentationTimeStamp(out!)
    let dur = CMSampleBufferGetDuration(out!)
    if (dur.value > 0)
        pts = CMTimeAdd(pts, dur);
    if (isVideo) {
        lastVideo = pts;
    else {
        lastAudio = pts;
    // The Bool returned by encodeFrame is not used here.
    encoder?.encodeFrame(sampleBuffer: out!, isVideo: isVideo)

And this is my VideoEncoder class:

/// Owns an AVAssetWriter targeting an MP4 file at `path`, together with one
/// video and one audio AVAssetWriterInput.
///
/// NOTE(review): this listing is truncated. The `outputSettings` dictionaries
/// are cut off, and no call that adds the inputs to the writer, starts writing,
/// or appends a sample buffer is visible. Comments describe only what is shown.
final class VideoEncoder {
    var writer:AVAssetWriter
    var videoInput:AVAssetWriterInput
    var audioInput:AVAssetWriterInput
    var path:String

    /// Creates the writer and its two inputs for the file at `path`.
    /// `height`, `width`, `channels` and `rate` are presumably consumed by the
    /// output settings, which are not visible in this listing — confirm.
    init(path:String, height:Int, width:Int, channels:UInt32, rate:Float64) {
        self.path = path
        // Remove any existing file at this path; a removal failure is ignored (`try?`).
        if FileManager.default.fileExists(atPath:path) {
            try? FileManager.default.removeItem(atPath: path)
        let url = URL(fileURLWithPath: path)
        // NOTE(review): `try!` crashes if the writer cannot be created.
        writer = try! AVAssetWriter(outputURL: url, fileType: .mp4)
        // Video input using the H.264 codec; the remaining settings are cut off here.
        videoInput = AVAssetWriterInput(mediaType: .video, outputSettings: [
            AVVideoCodecKey: AVVideoCodecType.h264,
        videoInput.expectsMediaDataInRealTime = true

        // Audio input; its settings dictionary is cut off in this listing.
        audioInput = AVAssetWriterInput(mediaType: .audio, outputSettings: [
        audioInput.expectsMediaDataInRealTime = true

    /// Asks the writer to finish writing, forwarding `completionHandler` to it.
    func finish(with completionHandler:@escaping ()->Void) {
        writer.finishWriting(completionHandler: completionHandler)

    /// Handles one sample buffer for the video or audio input, selected by `isVideo`.
    ///
    /// As shown, returns `true` when the selected input reports it is ready for
    /// more data, and `false` when the writer has failed. The trailing
    /// `return false` presumably covers the remaining cases (data not ready,
    /// input not ready) — confirm against the full source.
    func encodeFrame(sampleBuffer:CMSampleBuffer, isVideo:Bool) -> Bool {
        if CMSampleBufferDataIsReady(sampleBuffer) {
            // First buffer: start the session at this buffer's presentation time.
            // NOTE(review): no startWriting() call is visible before startSession — confirm.
            if writer.status == .unknown {
                writer.startSession(atSourceTime: CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
            if writer.status == .failed {
                QFLogger.shared.addLog(format: "[ERROR initiating AVAssetWriter]", args: [], error: writer.error)
                return false
            // NOTE(review): no append(sampleBuffer) call is visible in either branch
            // below — likely lines lost from the listing.
            if isVideo {
                if videoInput.isReadyForMoreMediaData {
                    return true
            else {
                if audioInput.isReadyForMoreMediaData {
                    return true
        return false

The rest of the code should be pretty obvious, but just to make it complete, here is what I have for pausing:

// Pause: captureOutput checks `isPaused` before processing a buffer, and checks
// `discont` before recomputing the timestamp offset.
isPaused = true
discont = true

And here is resume:

// Resume: clears the paused flag. `discont` is left set here; captureOutput
// resets it (`discont = false`) when it handles the discontinuity.
isPaused = false

If anyone could help me to understand how to align video and audio tracks during such live recording that would be great!


  • OK, it turns out there was no mistake in the code I provided. The issue I experienced was caused by video smoothing, which was turned ON :) I guess it needs extra frames to smooth the video, which is why the video output freezes at the end for a short period of time.