Tags: swift, core-audio, audiotoolbox, audioqueue, sfspeechrecognizer

Using AudioToolbox instead of AVFoundation in SFSpeechRecognizer


I have to use AudioToolbox instead of AVAudioSession to provide the stream to SFSpeechRecognizer. I know that I should use AudioQueue, so I record audio and export it to a CMSampleBuffer for the recognizer to read. While debugging I can see that the buffer is appended to the SFSpeechAudioBufferRecognitionRequest, but the recognition task's closure never executes: there is neither a result nor an error.

What's wrong with the code?

let NUM_BUFFERS = 1
struct RecordState {
    var dataFormat = AudioStreamBasicDescription()
    var queue: AudioQueueRef?
    var buffers: [AudioQueueBufferRef] = []
    var audioFile: AudioFileID?
    var currentPacket: Int64 = 0
    var recording = false
}

func callback(_ inUserData: UnsafeMutableRawPointer?,
              _ inAQ: AudioQueueRef,
              _ inBuffer: AudioQueueBufferRef,
              _ inStartTime: UnsafePointer<AudioTimeStamp>,
              _ inNumberPacketDescriptions: UInt32,
              _ inPacketDescs: UnsafePointer<AudioStreamPacketDescription>?) {
    let recordState = inUserData?.assumingMemoryBound(to: RecordState.self)

    if let queue = recordState?.pointee.queue {
        AudioQueueEnqueueBuffer(queue, inBuffer, 0, nil)

        let rec = AudioRecorder.sharedInstance
        rec.transformBuffer(pBuffer: inBuffer, pLength: inBuffer.pointee.mAudioDataByteSize)
    }
}


class AudioRecorder: NSObject, ObservableObject, SFSpeechRecognizerDelegate {
    let format = AudioStreamBasicDescription(
        mSampleRate: Float64(16000.0),
        mFormatID: kAudioFormatLinearPCM,
        mFormatFlags: kAudioFormatFlagsNativeFloatPacked,
        mBytesPerPacket: UInt32(MemoryLayout<Float32>.size),
        mFramesPerPacket: 1,
        mBytesPerFrame: UInt32(MemoryLayout<Float32>.size),
        mChannelsPerFrame: 1,
        mBitsPerChannel: UInt32(MemoryLayout<Float32>.size * 8),
        mReserved: 0)
    var recordState = RecordState()
    var startTime = CFAbsoluteTimeGetCurrent()

    static var sharedInstance = AudioRecorder()

    private var speechRecognizer = SFSpeechRecognizer()!
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?

    private var engineEnabled = false
    private var lastText = [SFTranscriptionSegment]()

    override init() {
        super.init()

        OperationQueue.main.addOperation {
            SFSpeechRecognizer.requestAuthorization { authStatus in
                switch authStatus {
                case .authorized:
                    self.engineEnabled = true
                default:
                    self.engineEnabled = false
                }
            }
        }

        self.speechRecognizer.delegate = self
    }

    func startRecording() {
        recordState.dataFormat = format

        var queue: AudioQueueRef?
        if AudioQueueNewInput(&recordState.dataFormat, callback, &recordState, CFRunLoopGetCurrent(), CFRunLoopMode.commonModes.rawValue, 0, &queue) == noErr {
            recordState.queue = queue
        } else {
            return
        }

        for _ in 0..<NUM_BUFFERS {
            var buffer: AudioQueueBufferRef?
            if AudioQueueAllocateBuffer(queue!, 1024, &buffer) == noErr {
                recordState.buffers.append(buffer!)
            }
            AudioQueueEnqueueBuffer(queue!, buffer!, 0, nil)
        }

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else { fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object") }
        recognitionRequest.shouldReportPartialResults = true

        // Keep speech recognition data on device
        if #available(iOS 13, *) {
            recognitionRequest.requiresOnDeviceRecognition = true
        }

        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false

            if let result = result {
                print(result.bestTranscription.formattedString)

                isFinal = result.isFinal
            }

            if error != nil || isFinal {
                // Stop recognizing speech if there is a problem.

                self.recognitionRequest = nil
                self.recognitionTask = nil
            }
        }

        recordState.recording = true

        if AudioQueueStart(recordState.queue!, nil) != noErr {
            fatalError("Something is wrong")
        }
        self.startTime = CFAbsoluteTimeGetCurrent()
    }

    func stopRecording() {
        recordState.recording = false

        AudioQueueStop(recordState.queue!, true)

        for i in 0..<NUM_BUFFERS {
            AudioQueueFreeBuffer(recordState.queue!, recordState.buffers[i])
        }

        AudioQueueDispose(recordState.queue!, true)
        if let file = recordState.audioFile {
            AudioFileClose(file)
        }
    }

    func transformBuffer(pBuffer: AudioQueueBufferRef, pLength: UInt32) {
        var blockBuffer: CMBlockBuffer?
        CMBlockBufferCreateWithMemoryBlock(allocator: kCFAllocatorDefault, memoryBlock: pBuffer, blockLength: Int(pLength), blockAllocator: kCFAllocatorNull, customBlockSource: nil, offsetToData: 0, dataLength: Int(pLength), flags: kCMBlockBufferAssureMemoryNowFlag, blockBufferOut: &blockBuffer)

        let timeFormat = format.mSampleRate
        let currentTime = CFAbsoluteTimeGetCurrent()
        let elapsedTime: CFTimeInterval = currentTime - self.startTime
        let timeStamp = CMTimeMake(value: Int64(elapsedTime * timeFormat), timescale: Int32(timeFormat))

        let nSamples = Int(pLength / format.mBytesPerFrame)
        do {
            let formatDescription = try CMAudioFormatDescription(audioStreamBasicDescription: format)

            var sampleBuffer: CMSampleBuffer?
            CMAudioSampleBufferCreateWithPacketDescriptions(allocator: kCFAllocatorDefault, dataBuffer: blockBuffer, dataReady: true, makeDataReadyCallback: nil, refcon: nil, formatDescription: formatDescription, sampleCount: nSamples, presentationTimeStamp: timeStamp, packetDescriptions: nil, sampleBufferOut: &sampleBuffer)
            if let sBuffer = sampleBuffer {
                self.recognitionRequest?.appendAudioSampleBuffer(sBuffer)
            }
        } catch {
            fatalError(error.localizedDescription)
        }
    }
}

UPD: I've modified the code above to make it more descriptive.


Solution

  • Finally, I've found the answer. Here's the code for converting an AudioQueueBufferRef into an AVAudioPCMBuffer:

    func queueBufferToAudioBuffer(_ buffer: AudioQueueBufferRef) -> AVAudioPCMBuffer? {
        // Build an AVAudioFormat matching the queue's ASBD: 32-bit float, mono, interleaved.
        guard let audioFormat = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: format.mSampleRate,
            channels: format.mChannelsPerFrame,
            interleaved: true)
        else { return nil }

        let frameLength = buffer.pointee.mAudioDataBytesCapacity / audioFormat.streamDescription.pointee.mBytesPerFrame

        guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameLength) else { return nil }
        audioBuffer.frameLength = frameLength

        // Copy the raw queue bytes into the PCM buffer's first (and only) channel.
        let dstLeft = audioBuffer.floatChannelData![0]
        let src = buffer.pointee.mAudioData.bindMemory(to: Float.self, capacity: Int(frameLength))
        dstLeft.initialize(from: src, count: Int(frameLength))

        return audioBuffer
    }
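
    With that helper in place, the recognizer can be fed straight from the AudioQueue callback. The sketch below is not part of the original answer (appendQueueBuffer is a hypothetical name); it assumes the AudioRecorder class from the question and relies on SFSpeechAudioBufferRecognitionRequest's append(_:) method, which accepts an AVAudioPCMBuffer, so no CMSampleBuffer wrapping is needed:

    // Hypothetical member of AudioRecorder: call this from the AudioQueue callback
    // in place of transformBuffer(pBuffer:pLength:).
    func appendQueueBuffer(_ buffer: AudioQueueBufferRef) {
        guard let pcmBuffer = queueBufferToAudioBuffer(buffer) else { return }
        // append(_:) on SFSpeechAudioBufferRecognitionRequest takes an AVAudioPCMBuffer directly.
        recognitionRequest?.append(pcmBuffer)
    }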