I am using Azure's Speech to Text service to transcribe an audio stream. I am able to get results with diarization as well; however, the service struggles to identify the speaker names correctly.
I have now chosen to take a multi-channel audio stream (in this case, 2 channels) and would like to differentiate the text coming from each channel. I felt this way I can get the correct speaker names, since I know Speaker 1 always speaks through channel 1 and Speaker 2 through channel 2.
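As a quick sanity check before transcribing, the standard-library wave module confirms the recording really has two channels (a minimal sketch; the file path is a placeholder):

import wave

with wave.open("path/to/file.wav", 'rb') as wf:  # placeholder path
    print("channels:", wf.getnchannels())        # expect 2 for a stereo recording
    print("sample rate:", wf.getframerate())
    print("sample width (bytes):", wf.getsampwidth())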
But I don't see the response reporting a separate channel per speaker. Regardless of which channel the speech came from, the result always shows "Channel": 0.
Can someone help me get results where I can differentiate the text by the 'Channel' field, or by some other means? Two sample results:
{"Id":"e84432bb0b3a4283b10914fc528ddbc9","RecognitionStatus":"Success","DisplayText":"SOME TEXT HERE.","Offset":733900000,"Duration":74800000,"Channel":0,"Type":"ConversationTranscription","SpeakerId":"Guest-1"}
{"Id":"33b6600f7ca44f3db4a0fa59be17d403","RecognitionStatus":"Success","DisplayText":"SOME TEXT HERE.","Offset":808700000,"Duration":76800000,"Channel":0,"Type":"ConversationTranscription","SpeakerId":"Guest-2"}
import logging
import time
import traceback

import azure.cognitiveservices.speech as speechsdk

speech_key = "<speech_key>"
service_region = "<speech_region>"
audio_filename = "path/to/file.wav"

def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    print('Canceled event')

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStopped event')

def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    try:
        print('TRANSCRIBED:')
        print(evt.result.json)
    except Exception:
        print(traceback.format_exc())
        logging.error(traceback.format_exc())

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStarted event')

def recognize_from_file():
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region)
    speech_config.speech_recognition_language = "en-US"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        # Callback that signals to stop continuous transcription upon receiving an event `evt`.
        print('CLOSING on {}'.format(evt))
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber.
    conversation_transcriber.transcribed.connect(
        conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(
        conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(
        conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(
        conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session stopped or canceled events.
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()

    # Wait for completion.
    while not transcribing_stop:
        time.sleep(10)

    conversation_transcriber.stop_transcribing_async()

recognize_from_file()
To obtain the correct channel numbers (1 and 2) instead of channel 0 in the JSON output, split the stereo file into one mono byte stream per channel, transcribe each stream in its own ConversationTranscriber session, and patch the parsed JSON with the channel number you already know. The service itself still reports Channel 0 in every session, because each stream it receives is mono, so the channel has to be tracked manually. Here is the modified complete code.
Code :
import json
import logging
import time
import traceback
import wave

import azure.cognitiveservices.speech as speechsdk

speech_key = "<speech_key>"
service_region = "<speech_region>"
audio_filename = "path/to/file.wav"

def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    print('Canceled event')

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStopped event')

def conversation_transcriber_transcribed_cb(channel, evt: speechsdk.SpeechRecognitionEventArgs):
    try:
        print('TRANSCRIBED:')
        # Parse the result JSON and overwrite the Channel field with the
        # channel number we are tracking manually.
        result_json = evt.result.json
        result_data = json.loads(result_json)
        result_data["Channel"] = channel
        modified_result_json = json.dumps(result_data)
        print(modified_result_json)
        display_text = result_data.get("DisplayText", "")
        print(f"Manually Tracked Channel: {channel}, Text: {display_text}")
    except Exception:
        print(traceback.format_exc())
        logging.error(traceback.format_exc())

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStarted event')

def recognize_from_audio_file(wav_data, channel):
    print(f"Processing channel {channel}...")
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_recognition_language = "en-US"

    # The default PushAudioInputStream format is 16 kHz, 16-bit, mono PCM;
    # the source WAV must match (see the note after the code).
    audio_input_stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=audio_input_stream)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)

    transcribing_stop = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        nonlocal transcribing_stop
        transcribing_stop = True

    # Bind the manually tracked channel number into the transcribed callback.
    conversation_transcriber.transcribed.connect(
        lambda evt: conversation_transcriber_transcribed_cb(channel, evt))
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()
    audio_input_stream.write(wav_data)
    audio_input_stream.close()

    # Wait for completion without spinning the CPU.
    while not transcribing_stop:
        time.sleep(.5)

    conversation_transcriber.stop_transcribing_async()

def main():
    wf = wave.open(audio_filename, 'rb')
    try:
        stereo_audio = wf.readframes(wf.getnframes())
        # De-interleave 16-bit stereo PCM: each 4-byte frame holds a 2-byte
        # sample for channel 1 followed by a 2-byte sample for channel 2.
        mono_channel_1 = bytearray()
        mono_channel_2 = bytearray()
        for i in range(0, len(stereo_audio), 4):
            mono_channel_1.extend(stereo_audio[i:i+2])
            mono_channel_2.extend(stereo_audio[i+2:i+4])
        recognize_from_audio_file(bytes(mono_channel_1), channel=1)
        recognize_from_audio_file(bytes(mono_channel_2), channel=2)
    except Exception as e:
        print("Error:", e)
    finally:
        wf.close()

if __name__ == "__main__":
    main()
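One caveat: PushAudioInputStream defaults to 16 kHz, 16-bit, mono PCM, so the code above only works as-is for a 16 kHz, 16-bit stereo source. If your WAV differs, build an explicit stream format from the file's own parameters and pass it to the push stream (a minimal sketch; the file path is a placeholder):

import wave
import azure.cognitiveservices.speech as speechsdk

with wave.open("path/to/file.wav", 'rb') as wf:  # placeholder path
    stream_format = speechsdk.audio.AudioStreamFormat(
        samples_per_second=wf.getframerate(),
        bits_per_sample=wf.getsampwidth() * 8,
        channels=1)  # each split stream is mono
audio_input_stream = speechsdk.audio.PushAudioInputStream(stream_format=stream_format)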
Output :
Each result now carries the manually tracked channel: payloads from the first stream print with "Channel": 1 and those from the second with "Channel": 2, along with the "Manually Tracked Channel" line for each utterance.