python-3.10, azure-speech

Azure Speech to Text with a multi-channel audio stream: identify the text from each channel


I am using Azure's Speech to Text service to transcribe an audio stream. I can get results with diarization as well, but the service struggles to identify the speakers correctly.

I have now switched to a multi-channel audio stream (in this case 2 channels) and would like to differentiate the text coming from each channel. That way I can assign speaker names reliably, since I know speaker 1 always speaks on channel 1 and speaker 2 on channel 2.

However, the response does not report a separate channel for each result. Regardless of which channel the speech came from, every result shows 'Channel': 0.

Can someone help me get results where I can differentiate the text by the 'Channel' field, or by some other means?

Result JSON:

{"Id":"e84432bb0b3a4283b10914fc528ddbc9","RecognitionStatus":"Success","DisplayText":"SOME TEXT HERE.","Offset":733900000,"Duration":74800000,"Channel":0,"Type":"ConversationTranscription","SpeakerId":"Guest-1"}

{"Id":"33b6600f7ca44f3db4a0fa59be17d403","RecognitionStatus":"Success","DisplayText":"SOME TEXT HERE.","Offset":808700000,"Duration":76800000,"Channel":0,"Type":"ConversationTranscription","SpeakerId":"Guest-2"}

import logging
import time
import traceback

import azure.cognitiveservices.speech as speechsdk

speech_key = "<speech_key>"
service_region = "<speech_region>"
audio_filename = "path/to/file.wav"

def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    print('Canceled event')

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStopped event')

def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    try:
        print('TRANSCRIBED:')
        print(evt.result.json)
    except Exception:
        print(traceback.format_exc())
        logging.error(traceback.format_exc())

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStarted event')

def recognize_from_file():
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region)
    speech_config.speech_recognition_language = "en-US"

    audio_config = speechsdk.audio.AudioConfig(
        filename=audio_filename)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        # Callback that signals to stop continuous transcription upon receiving `evt`
        print('CLOSING on {}'.format(evt))
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(
        conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(
        conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(
        conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(
        conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()

    # Wait for completion
    while not transcribing_stop:
        time.sleep(10)

    conversation_transcriber.stop_transcribing_async()

Solution

  • The ConversationTranscriber reports "Channel": 0 regardless of which channel the speech came from, so to get the correct channel numbers (1 and 2) in the output, split the stereo file into two mono streams, transcribe each stream separately, and overwrite the Channel field in the parsed JSON with the channel number you already know. Here is the modified complete code.

    Code :

    import azure.cognitiveservices.speech as speechsdk
    import wave
    import json
    import logging
    import time
    import traceback
    
    speech_key = "<speech_key>"
    service_region = "<speech_region>"
    audio_filename = "path/to/file.wav"
    
    def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
        print('Canceled event')
    
    def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
        print('SessionStopped event')
    
    def conversation_transcriber_transcribed_cb(channel, evt: speechsdk.SpeechRecognitionEventArgs):
        try:
            print('TRANSCRIBED:')
            result_json = evt.result.json
            result_data = json.loads(result_json)
            result_data["Channel"] = channel
            modified_result_json = json.dumps(result_data)
            print(modified_result_json)
            
            display_text = result_data.get("DisplayText", "")
            print(f"Manually Tracked Channel: {channel}, Text: {display_text}")
            
        except Exception:
            print(traceback.format_exc())
            logging.error(traceback.format_exc())
    
    def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
        print('SessionStarted event')
    
    def recognize_from_audio_file(wav_data, channel):
        print(f"Processing channel {channel}...")
    
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        speech_config.speech_recognition_language = "en-US"
        audio_input_stream = speechsdk.audio.PushAudioInputStream()
        audio_config = speechsdk.audio.AudioConfig(stream=audio_input_stream)
        conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config=speech_config, audio_config=audio_config)
        transcribing_stop = False
    
        def stop_cb(evt):
            print('CLOSING on {}'.format(evt))
            nonlocal transcribing_stop
            transcribing_stop = True
            
        conversation_transcriber.transcribed.connect(lambda evt: conversation_transcriber_transcribed_cb(channel, evt))
        conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
        conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
        conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
        conversation_transcriber.session_stopped.connect(stop_cb)
        conversation_transcriber.canceled.connect(stop_cb)
        conversation_transcriber.start_transcribing_async()
    
        audio_input_stream.write(wav_data)
        audio_input_stream.close()
        # Poll until the session-stopped/canceled callback flips the flag (avoid a busy spin)
        while not transcribing_stop:
            time.sleep(0.5)
        conversation_transcriber.stop_transcribing_async()
    
    def main():
        wf = wave.open(audio_filename, 'rb')
        try:
            stereo_audio = wf.readframes(wf.getnframes())
            mono_channel_1 = bytearray()
            mono_channel_2 = bytearray()

            # De-interleave the 16-bit stereo PCM: each 4-byte frame holds
            # 2 bytes for channel 1 followed by 2 bytes for channel 2.
            for i in range(0, len(stereo_audio), 4):
                mono_channel_1.extend(stereo_audio[i:i+2])
                mono_channel_2.extend(stereo_audio[i+2:i+4])
            recognize_from_audio_file(bytes(mono_channel_1), channel=1)
            recognize_from_audio_file(bytes(mono_channel_2), channel=2)
    
        except Exception as e:
            print("Error:", e)
        finally:
            wf.close()
    
    if __name__ == "__main__":
        main()
    

    Output :

    (Screenshot of the console transcription output.)
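
    Note : speechsdk.audio.PushAudioInputStream() uses the SDK's default input format of 16 kHz, 16-bit, mono PCM. If your WAV file has a different sample rate or sample width, the stream format can be built from the WAV header instead. A minimal sketch (the helper name make_push_stream_for is illustrative, not part of the code above):

    import wave
    import azure.cognitiveservices.speech as speechsdk

    def make_push_stream_for(wav_path: str) -> speechsdk.audio.PushAudioInputStream:
        # Read the rate and sample width from the source file so the push stream
        # matches the PCM that will be written into it (each split channel is mono).
        with wave.open(wav_path, 'rb') as wf:
            stream_format = speechsdk.audio.AudioStreamFormat(
                samples_per_second=wf.getframerate(),
                bits_per_sample=wf.getsampwidth() * 8,
                channels=1)
        return speechsdk.audio.PushAudioInputStream(stream_format=stream_format)

    In recognize_from_audio_file, such a stream would simply replace the default PushAudioInputStream() so each channel's audio is interpreted at the correct rate.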