I'm using Python's SpeechRecognition to generate captions for a livestream. I noticed that when I listen to mic input, the recognizer needs a couple of seconds of silence before it stops capturing audio. Is there a way to reduce the amount of silence needed to, say, 0.5 seconds? I'm open to using other methods/libraries, as long as it's not something too low level.
Here's my code so far:
import speech_recognition as sr
from googletrans import Translator
import threading
# Config
# Path of the text file that transcribe() overwrites with the latest caption.
OUTPUT_FILE_NAME = "transcription.txt"
def listen(recognizer, microphone):
    """Capture one phrase from the microphone and return the recorded audio.

    Args:
        recognizer: Object whose ``listen(source)`` method records a phrase
            (an ``sr.Recognizer`` in this script).
        microphone: Context manager that yields the audio source
            (an ``sr.Microphone`` in this script).

    Returns:
        Whatever ``recognizer.listen`` returns for the captured phrase.
    """
    # The microphone is only held open for the duration of one capture.
    with microphone as source:
        audio = recognizer.listen(source)
    return audio
def transcribe(audio, recognizer, translator):
    """Recognize Ukrainian speech, translate it to English and write it out.

    The result is written to ``OUTPUT_FILE_NAME`` via ``write_to_file``.

    Args:
        audio: Captured audio to pass to ``recognizer.recognize_google``.
        recognizer: Recognizer used for the speech-to-text request.
        translator: Translator whose ``translate`` result exposes ``.text``.
    """
    try:
        uk_text = recognizer.recognize_google(audio, language="uk-UA")
        translated_text = translator.translate(uk_text, src="uk", dest="en")
        write_to_file(OUTPUT_FILE_NAME, translated_text.text)
    except sr.UnknownValueError:
        print("Could not understand audio.")
        # Clear the caption file so a stale caption is not left on screen.
        write_to_file(OUTPUT_FILE_NAME, "")
    except sr.RequestError as e:
        print(f"Error occurred during recognition: {e}")
def write_to_file(file_path, text):
    """Overwrite ``file_path`` with ``text``, encoded as UTF-8.

    Args:
        file_path: Path of the file to (re)create.
        text: String to write; an empty string leaves an empty file.
    """
    # Mode "w" truncates, so the file only ever holds the latest text.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text)
def get_mic():
    """List the available microphones and prompt until an integer is entered.

    Returns:
        The index typed by the user, as an ``int``. The value is not checked
        against the range of listed devices.
    """
    for index, source in enumerate(sr.Microphone.list_microphone_names()):
        print(f"{index}: {source}")
    while True:
        try:
            index = input("Select an index from the list above: ")
            return int(index)
        except ValueError:
            # Non-numeric input: report it and ask again.
            print("Invalid index")
if __name__ == "__main__":
    # Script entry point: pick a microphone, calibrate, then caption in a loop
    # until interrupted with Ctrl+C.
    mic_index = get_mic()
    translator = Translator()
    recognizer = sr.Recognizer()
    microphone = sr.Microphone(device_index=mic_index)

    print("Adjusting for ambient noise, please don't say anything...")
    with microphone as source:
        # NOTE(review): the body of this `with` was missing from the post;
        # reconstructed as the ambient-noise calibration the message announces.
        recognizer.adjust_for_ambient_noise(source)

    try:
        while True:
            audio = listen(recognizer, microphone)
            # NOTE(review): the Thread arguments and start() call were cut off
            # in the post; reconstructed from transcribe()'s signature.
            # Transcribing on a separate thread lets the loop go straight
            # back to listening.
            transcription_thread = threading.Thread(
                target=transcribe,
                args=(audio, recognizer, translator),
            )
            transcription_thread.start()
    except KeyboardInterrupt:
        print("\nShutting down recognition service...")
        write_to_file("transcription.txt", "Recognition service inactive. This is sample text.")
I've tried using the `phrase_time_limit` argument of the `.listen()` function, but that's not what I'm looking for, as it would sometimes cut me off in the middle of a word.
From the source code of the SpeechRecognition library, the setting you need is `pause_threshold`. It is not a constructor parameter but an instance attribute of `Recognizer`, which `Recognizer.__init__` sets to a default value:
self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
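In your code above, set it on the recognizer right after creating it (the `Recognizer` constructor takes no arguments, so it cannot be passed as a keyword):
recognizer = sr.Recognizer()
recognizer.pause_threshold = 0.5  # or other value
Try experimenting with the `pause_threshold` value to find what works for your stream. Note that `listen()` requires `pause_threshold >= non_speaking_duration` (which defaults to 0.5), so to go below 0.5 seconds you also need to lower `recognizer.non_speaking_duration`.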