Hello, I'm trying to build real-time speech-to-text with Streamlit and the Azure Speech SDK. I can easily transcribe audio/video files with no issues, but I want to integrate real-time transcription from the browser.
With the code I've tried, nothing gets transcribed when I speak into the microphone.
I've also tried to reuse the file-based function I wrote, passing it the AudioStream and making it async, but that didn't work either.
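For reference, the file-based transcription that already works looks roughly like this (a minimal sketch, not my exact code; `transcribe_file` and the callback names are placeholders):

```python
import time
import azure.cognitiveservices.speech as speechsdk

def transcribe_file(wav_path: str, key: str, region: str) -> list:
    """Sketch of file-based conversation transcription with speaker IDs."""
    speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
    audio_config = speechsdk.AudioConfig(filename=wav_path)
    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config, audio_config, language="it-IT")

    results = []
    done = False

    def on_transcribed(evt):
        # Each result carries the recognized text and the detected speaker
        if evt.result.text:
            results.append(f"**{evt.result.speaker_id}**: {evt.result.text}")

    def on_stopped(evt):
        nonlocal done
        done = True

    transcriber.transcribed.connect(on_transcribed)
    transcriber.session_stopped.connect(on_stopped)
    transcriber.canceled.connect(on_stopped)

    transcriber.start_transcribing_async().get()
    while not done:          # wait until the whole file has been processed
        time.sleep(0.5)
    transcriber.stop_transcribing_async().get()
    return results
```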
The guided path at https://microsoft.github.io/TechExcel-Implementing-automation-practices-using-Azure-OpenAI/docs/04_implement_audio_transcription/0402.html only works on the local machine because it uses the host's microphone.
I've tried the code posted below, searched on Google and asked AI. I want the user to be able to start live speech-to-text with live transcription in the chat (speaker recognition must stay).
EDIT 19/3: By using pydub I can now save and listen to the .wav file; I only need to pass the stream to the Speech SDK.
Edited Code:
def addsentence(evt: ConversationTranscriptionEventArgs):
    if evt.result.speaker_id == "Unknown":
        logger.debug("Unknown speaker: " + str(evt))
        return
    logger.info(f"Detected **{evt.result.speaker_id}**: {evt.result.text}")
    st.session_state.r.append(f"**{evt.result.speaker_id}**: {evt.result.text}")

webrtc_ctx = webrtc_streamer(key="speech-to-text", mode=WebRtcMode.SENDONLY,
                             media_stream_constraints={"video": False, "audio": True},
                             audio_receiver_size=256)

while webrtc_ctx.state.playing:
    if not st.session_state["recording"]:
        st.session_state.r = []
        st.session_state.stream = PushAudioInputStream()
        ###
        audio_input = speechsdk.AudioConfig(stream=st.session_state.stream)
        speech_config = speechsdk.SpeechConfig(env["SPEECH_KEY"], env["SPEECH_REGION"])
        if "proxy_host" in env and "proxy_port" in env:
            speech_config.set_proxy(env["proxy_host"], int(env["proxy_port"]))
        conversation_transcriber = ConversationTranscriber(speech_config, audio_input, language="it-IT")
        conversation_transcriber.transcribed.connect(addsentence)
        ###
        st.session_state.fullwav = pydub.AudioSegment.empty()
        with st.chat_message("assistant"):
            with st.spinner("Trascrizione in corso..."):
                stream_placeholder = st.expander("Trascrizione", icon="đ").empty()
                conversation_transcriber.start_transcribing_async()
                logger.info("Transcribing started!")
                st.session_state["recording"] = True
    try:
        audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
    except queue.Empty:
        time.sleep(0.1)
        logger.debug("No frame arrived.")
        continue
    stream_placeholder.markdown("## Trascrizione:\n\n" + "\\\n".join(st.session_state.r))
    for audio_frame in audio_frames:
        st.session_state.stream.write(audio_frame.to_ndarray().tobytes())
        sound = pydub.AudioSegment(
            data=audio_frame.to_ndarray().tobytes(),
            sample_width=audio_frame.format.bytes,
            frame_rate=audio_frame.sample_rate,
            channels=len(audio_frame.layout.channels),
        )
        st.session_state.fullwav += sound

if st.session_state["recording"]:
    logger.info("stopped listening")
    wav_file_path = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
    st.session_state.fullwav.export(wav_file_path, format="wav")
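What still seems to be missing is telling the SDK what format the bytes pushed into the stream actually are. A minimal sketch of a push stream with an explicit PCM format (the 16 kHz / 16-bit / mono values below are what the SDK expects by default, not necessarily what streamlit-webrtc delivers, so the raw frames would need resampling first):

```python
import azure.cognitiveservices.speech as speechsdk

# Assumption: the bytes written to the stream are 16 kHz, 16-bit, mono PCM.
stream_format = speechsdk.audio.AudioStreamFormat(
    samples_per_second=16000, bits_per_sample=16, channels=1)
push_stream = speechsdk.audio.PushAudioInputStream(stream_format=stream_format)
audio_config = speechsdk.AudioConfig(stream=push_stream)
```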
EDIT 28/3: I rolled back to PushAudioInputStream; I was able to process an RTMP stream using ffmpeg:
def transcribe_rmtp(self, rtmp_url: str) -> str:
    push_stream = PushAudioInputStream()
    audio_config = AudioConfig(stream=push_stream)
    transcriber = self.setup_transcriber(audio_config)
    transcriber.start_transcribing_async()
    # Decode the RTMP stream to 16 kHz mono s16le PCM on stdout
    ffmpeg_args = [
        "ffmpeg", "-i", rtmp_url, "-vn", "-ac", "1", "-ar", "16000",
        "-f", "s16le", "-fflags", "+genpts", "-bufsize", "512k",
        "-maxrate", "128k", "pipe:1"]
    ffmpeg_process = subprocess.Popen(ffmpeg_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        while not self.done:
            if self.on_transcribed:
                self.on_transcribed("\\\n".join(self.results))
            chunk = ffmpeg_process.stdout.read(4096)
            if not chunk:
                break
            push_stream.write(chunk)
            time.sleep(0.1)
    except Exception as e:
        logger.error("Errore durante lo streaming RTMP: %s", e)
    finally:
        push_stream.close()
        ffmpeg_process.kill()
        transcriber.stop_transcribing_async()
    return "\\\n".join(self.results)
I've found a working solution:
def transcribe_webrtc(self, webrtc_ctx: WebRtcStreamerContext) -> str:
    push_stream = PushAudioInputStream()
    audio_config = AudioConfig(stream=push_stream)
    transcriber = self.setup_transcriber(audio_config)
    transcriber.start_transcribing_async()
    logger.info("Started WebRTC transcription")
    try:
        while webrtc_ctx.state.playing:
            audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            if not audio_frames:
                logger.debug("No audio frames received")
                continue
            frame = pydub.AudioSegment.empty()
            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                frame += sound
            if len(frame) > 0:
                logger.debug(f"Processing audio frame of length {len(frame.raw_data)} bytes")
                # Convert to the 16 kHz mono PCM the Speech SDK expects
                frame = frame.set_channels(1).set_frame_rate(16000)
                push_stream.write(frame.raw_data)
            if self.on_transcribed:
                self.on_transcribed("\\\n".join(self.results))
            time.sleep(0.1)
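For completeness, this is roughly how it's wired into the Streamlit page (a sketch; `AzureTranscriber` is a placeholder name for my wrapper class holding the methods above):

```python
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer

placeholder = st.empty()  # updated live by the on_transcribed callback

webrtc_ctx = webrtc_streamer(
    key="speech-to-text",
    mode=WebRtcMode.SENDONLY,
    media_stream_constraints={"video": False, "audio": True},
    audio_receiver_size=256,
)

if webrtc_ctx.state.playing:
    transcriber = AzureTranscriber()  # hypothetical wrapper exposing transcribe_webrtc
    transcriber.on_transcribed = lambda text: placeholder.markdown(text)
    transcript = transcriber.transcribe_webrtc(webrtc_ctx)
```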