I am currently trying to write a Python script that recognizes speech from a microphone using continuous recognition. I used the sample code from the Azure Speech service documentation (https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python). However, my program never exits the while loop. How can I stop the recognition without typing in a command? Is it possible to stop continuous speech recognition with a verbal cue (e.g. a long pause or a spoken keyword)? I am trying to build a voicebot. Am I correct that, in order to interact with the voicebot, users must either speak for less than 15 s (using single-shot recognition) or interact with the device after every utterance (using continuous recognition)? Thank you!
duplicate question without reply: https://learn.microsoft.com/en-us/answers/questions/1850234/stop-continuous-speech-recognition-from-microphone?comment=question#newest-question-comment
code:
import time
from dotenv import dotenv_values
import azure.cognitiveservices.speech as speechsdk
def recognised_speech(evt):
    """Print the transcript carried by a ``recognized`` event.

    Args:
        evt: Event object handed to the ``recognized`` callback. Only
            ``evt.result.text`` is read.
    """
    print(f"You: {evt.result.text}")
def cont_speech_to_text():
    """Run continuous recognition on the module-level ``speech_recognizer``.

    Blocks, polling every 0.5 s, until ``stop_cb`` sets ``done_talking``.
    ``stop_cb`` is connected only to the ``session_stopped`` and ``canceled``
    events, so nothing in this function ends the session in response to what
    the speaker says or to silence.
    """
    done_talking = False

    def stop_cb(evt):
        """Stop recognition and release the polling loop below."""
        nonlocal done_talking
        print('You: {}'.format(evt))
        done_talking = True
        speech_recognizer.stop_continuous_recognition()

    # Optional: also print partial hypotheses while the user is speaking.
    # speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(recognised_speech)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    # Callbacks run while this loop sleeps; it only waits for the flag.
    while not done_talking:
        time.sleep(.5)
# Region passed to SpeechConfig below.
SPEECH_REGION = "westeurope"
# Path prefix of the file holding the key; ".key" is appended below.
keypath="..."
# Load the values from that file; the subscription key is read from its 'KEY' entry.
speechkey=dotenv_values(keypath+".key")
speech_config = speechsdk.SpeechConfig(subscription=speechkey['KEY'], region=SPEECH_REGION)
speech_config.speech_recognition_language="en-US"
# Synthesis voice setting; no speech synthesis is performed in the code shown here.
speech_config.speech_synthesis_voice_name='en-US-AvaMultilingualNeural'
# Take audio input from the default microphone.
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
# Module-level recognizer; cont_speech_to_text() reads it as a global.
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
# Start recognition; the call returns only once its polling loop ends.
cont_speech_to_text()
'''
Yes, you can exit the loop and stop the recognition either with a voice command or after a long pause.
def cont_speech_to_text(stop_phrase="stop listening"):
    """Run continuous recognition until the speaker says ``stop_phrase``.

    Uses the module-level ``speech_recognizer``. Blocks, polling every 0.5 s,
    until the session stops or is canceled.

    Args:
        stop_phrase: Phrase that ends recognition when it occurs anywhere in
            a recognized utterance. Matching is case-insensitive. An empty
            value or ``None`` disables the voice command.
    """
    done_talking = False
    # Normalize once; an empty phrase must not match every utterance.
    phrase = (stop_phrase or "").lower()

    def recognised_speech(evt):
        """Print the utterance and stop recognition on the stop phrase."""
        print(f"You: {evt.result.text}")
        if phrase and phrase in evt.result.text.lower():
            speech_recognizer.stop_continuous_recognition()

    def stop_cb(evt):
        """Stop recognition and release the polling loop below."""
        nonlocal done_talking
        print('CLOSING {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        done_talking = True

    # Optional: also print partial hypotheses while the user is speaking.
    # speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(recognised_speech)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    # Callbacks run while this loop sleeps; it only waits for the flag.
    while not done_talking:
        time.sleep(.5)
# Region passed to SpeechConfig below.
SPEECH_REGION = "eastus"
# NOTE(review): "xxxxxxx" looks like a redacted placeholder - supply a real subscription key.
speech_config = speechsdk.SpeechConfig(subscription="xxxxxxx", region=SPEECH_REGION)
speech_config.speech_recognition_language="en-US"
# Synthesis voice setting; no speech synthesis is performed in the code shown here.
speech_config.speech_synthesis_voice_name='en-US-AvaMultilingualNeural'
# Take audio input from the default microphone.
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
# Module-level recognizer; cont_speech_to_text() reads it as a global.
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
Output:
SESSION STARTED: SessionEventArgs(session_id=ee53dddc86fc4142ab6a3dddf03c5b7e)
You: Stop listening.
SESSION STOPPED SessionEventArgs(session_id=ee53dddc86fc4142ab6a3dddf03c5b7e)
CLOSING SessionEventArgs(session_id=ee53dddc86fc4142ab6a3dddf03c5b7e)
Here, I have used "stop listening" as the voice command to stop continuous speech recognition, but you can use any phrase you want.
Code
def cont_speech_to_text(pause_threshold=10):
    """Run continuous recognition until the speaker pauses for too long.

    Uses the module-level ``speech_recognizer``. Blocks, polling every 0.5 s,
    until the session stops or is canceled.

    Args:
        pause_threshold: Seconds without a non-empty recognized utterance
            after which recognition is stopped.
    """
    done_talking = False
    stop_requested = False
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    last_recognized_time = time.monotonic()

    def recognised_speech(evt):
        """Print the utterance and restart the pause timer."""
        nonlocal last_recognized_time
        print(f"You: {evt.result.text}")
        # Only a non-empty transcript counts as speech; an empty result
        # must not postpone the pause timeout.
        if evt.result.text:
            last_recognized_time = time.monotonic()
        # Uncomment to additionally stop on a voice command:
        # if "stop listening" in evt.result.text.lower():
        #     speech_recognizer.stop_continuous_recognition()

    def stop_cb(evt):
        """Stop recognition and release the polling loop below."""
        nonlocal done_talking
        print('CLOSING {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        done_talking = True

    # Optional: also print partial hypotheses while the user is speaking.
    # speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(recognised_speech)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    while not done_talking:
        paused_for = time.monotonic() - last_recognized_time
        # Request the stop once only; later iterations just wait for
        # stop_cb to set the flag instead of printing and stopping again.
        if not stop_requested and paused_for > pause_threshold:
            print("Stopping due to long pause...")
            stop_requested = True
            speech_recognizer.stop_continuous_recognition()
        time.sleep(.5)
# Region passed to SpeechConfig below.
SPEECH_REGION = "eastus"
# NOTE(review): "xxcxxxx" looks like a redacted placeholder - supply a real subscription key.
speech_config = speechsdk.SpeechConfig(subscription="xxcxxxx", region=SPEECH_REGION)
speech_config.speech_recognition_language="en-US"
# Synthesis voice setting; no speech synthesis is performed in the code shown here.
speech_config.speech_synthesis_voice_name='en-US-AvaMultilingualNeural'
# Take audio input from the default microphone.
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
# Module-level recognizer; cont_speech_to_text() reads it as a global.
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
Output:
SESSION STARTED: SessionEventArgs(session_id=ef6dc34bd7bd4e0b81ae9a6d9f4363c1)
You: Hi.
Stopping due to long pause...
SESSION STOPPED SessionEventArgs(session_id=ef6dc34bd7bd4e0b81ae9a6d9f4363c1)
CLOSING SessionEventArgs(session_id=ef6dc34bd7bd4e0b81ae9a6d9f4363c1)
Here, I record the time of every recognized utterance and, in the while loop, compare the time elapsed since then against a pause threshold of 10 seconds.
Alternatively, you can use both the voice command and the pause time to stop continuous speech recognition: the code is given above; just uncomment the if condition in the recognised_speech function.