I've been trying to implement a real-time speech recognition app for system audio in Python, but I can't figure out how to do both things at once. I'm able to capture the system audio, but since transcription takes time, I can't keep recording while I'm transcribing the text. I tried exploring multi-threading and a client-server architecture but couldn't make either work (maybe due to my poor coding abilities). I'd appreciate it if anyone could help me figure this out.

Here is a code sample for the multi-threading approach I came up with. I know this isn't the best way to do it, but I couldn't find any references that would help me figure out how to implement this.
import soundcard as sc
import soundfile as sf
import threading
import os
import time
import speech_recognition as sr

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
audio_file = os.path.join(CURRENT_DIR, "output.wav")
seconds = 5
speech = sr.Recognizer()

def record_buffer():
    # Capture the system (loopback) audio in 5-second chunks and overwrite output.wav
    while True:
        with sc.get_microphone(id=str(sc.default_speaker().name), include_loopback=True).recorder(samplerate=44100) as mic:
            audio = mic.record(numframes=44100 * seconds)
            sf.write(audio_file, audio, 44100)

thread = threading.Thread(target=record_buffer)
thread.start()

# Main loop: keep re-reading the file and transcribing it
while True:
    time.sleep(2)
    with sr.AudioFile(audio_file) as source:
        audio = speech.record(source)
    text = speech.recognize_google(audio)
    print(text)
You can use the queue module to handle the communication between the two threads; try this:
import soundcard as sc
import soundfile as sf
import threading
import os
import queue
import speech_recognition as sr
import time

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
audio_file = os.path.join(CURRENT_DIR, "output.wav")
seconds = 5
speech = sr.Recognizer()
audio_buffer = queue.Queue()

def record_buffer():
    # Producer: keep capturing 5-second chunks of loopback audio and push them onto the queue
    while True:
        with sc.get_microphone(id=str(sc.default_speaker().name), include_loopback=True).recorder(samplerate=44100) as mic:
            audio = mic.record(numframes=44100 * seconds)
            audio_buffer.put(audio)

def transcribe_buffer():
    # Consumer: block until a chunk is available, write it to disk, then transcribe it
    while True:
        audio = audio_buffer.get()
        sf.write(audio_file, audio, 44100)
        with sr.AudioFile(audio_file) as source:
            audio = speech.record(source)
        text = speech.recognize_google(audio)
        print(text)

record_thread = threading.Thread(target=record_buffer)
transcribe_thread = threading.Thread(target=transcribe_buffer)
record_thread.start()
transcribe_thread.start()
The audio_buffer queue stores the audio chunks captured by the record_buffer() function. The transcribe_buffer() function reads each chunk from the queue, saves it to a file, transcribes it, and prints the recognized text.
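One thing to be aware of: recognize_google() raises sr.UnknownValueError when a chunk contains no intelligible speech and sr.RequestError when the API call fails, and an uncaught exception will kill the transcribe thread. Here's a sketch of transcribe_buffer() with a guard around the recognition call; it assumes the same module-level names (speech, audio_buffer, audio_file) as the code above:

def transcribe_buffer():
    # Same consumer loop as above, but recognition errors no longer kill the thread
    while True:
        chunk = audio_buffer.get()
        sf.write(audio_file, chunk, 44100)
        with sr.AudioFile(audio_file) as source:
            audio = speech.record(source)
        try:
            print(speech.recognize_google(audio))
        except sr.UnknownValueError:
            # Chunk contained no intelligible speech (e.g. silence); skip it
            pass
        except sr.RequestError as e:
            # Network/API problem; report it and keep consuming
            print("Recognition request failed:", e)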
This probably isn't the best implementation, but it should help.
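If you'd rather avoid the round trip through output.wav, one option (not part of the answer above) is to build an sr.AudioData object directly from the NumPy array that soundcard returns. The sketch below assumes the recorder delivers float samples in [-1, 1] with shape (frames, channels), downmixes to mono, and converts to 16-bit PCM, which is what AudioData expects:

import numpy as np
import queue
import speech_recognition as sr

SAMPLE_RATE = 44100
speech = sr.Recognizer()
audio_buffer = queue.Queue()

def transcribe_buffer_in_memory():
    # Consume chunks from the queue and transcribe them without writing a temp file
    while True:
        chunk = audio_buffer.get()                          # float array, shape (frames, channels)
        mono = chunk.mean(axis=1)                           # downmix to mono
        pcm16 = (mono * 32767).astype(np.int16).tobytes()   # 16-bit PCM bytes
        audio = sr.AudioData(pcm16, SAMPLE_RATE, 2)         # sample_width=2 for int16
        try:
            print(speech.recognize_google(audio))
        except (sr.UnknownValueError, sr.RequestError):
            # Skip unintelligible chunks and API failures
            pass

Keeping the whole pipeline in memory also avoids writing to disk for every chunk.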