Tags: python, python-3.x, streamlit, azure-speech

Python Streamlit real-time speech-to-text with Azure Speech SDK


Hello, I'm trying to build real-time speech-to-text using Streamlit and the Azure Speech SDK. I can easily transcribe audio/video files with no issues, but I want to integrate real-time transcription from the browser.

With the code I've tried, when I speak into the microphone no sentence is ever transcribed.

I've also tried to reuse the file-based function I wrote, passing it the AudioStream and making it async, but that didn't work either.

The guided path at https://microsoft.github.io/TechExcel-Implementing-automation-practices-using-Azure-OpenAI/docs/04_implement_audio_transcription/0402.html only works on a local machine, because it uses the host's microphone.
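
For reference, that sample binds the transcriber to the default microphone of the machine running Python, roughly like this (a simplified sketch; the print handler is just illustrative):

    import azure.cognitiveservices.speech as speechsdk

    # Grabs the default microphone of the machine running Python --
    # on a deployed Streamlit app that is the server, not the visitor's browser.
    speech_config = speechsdk.SpeechConfig(subscription="<key>", region="<region>")
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)
    transcriber.transcribed.connect(lambda evt: print(evt.result.text))
    transcriber.start_transcribing_async().get()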

I've tried the code posted below, searched Google, and asked AI. I want the user to be able to start live speech-to-text with live transcription in the chat (speaker recognition must stay).

EDIT 19/3: Using pydub I can now save and listen to the .wav file; I only need to pass the stream to the Speech SDK.

Edited Code:

import queue
import tempfile
import time

import pydub
import streamlit as st
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import PushAudioInputStream
from azure.cognitiveservices.speech.transcription import (
    ConversationTranscriber, ConversationTranscriptionEventArgs)
from streamlit_webrtc import WebRtcMode, webrtc_streamer

# env (settings), logger, and st.session_state["recording"] are initialized elsewhere in the app

def addsentence(evt: ConversationTranscriptionEventArgs):
    # Skip results the service could not attribute to a speaker
    if evt.result.speaker_id == "Unknown":
        logger.debug("Unknown speaker: " + str(evt))
        return
    logger.info(f"Detected **{evt.result.speaker_id}**: {evt.result.text}")
    st.session_state.r.append(f"**{evt.result.speaker_id}**: {evt.result.text}")

webrtc_ctx = webrtc_streamer(key="speech-to-text", mode=WebRtcMode.SENDONLY,
        media_stream_constraints={"video": False, "audio": True},
        audio_receiver_size=256)

while webrtc_ctx.state.playing:
    if not st.session_state["recording"]:
        st.session_state.r = []

        st.session_state.stream = PushAudioInputStream()
        ###
        audio_input = speechsdk.AudioConfig(stream=st.session_state.stream)
        speech_config = speechsdk.SpeechConfig(env["SPEECH_KEY"], env["SPEECH_REGION"])
        if "proxy_host" in env and "proxy_port" in env:
            speech_config.set_proxy(env["proxy_host"], int(env["proxy_port"]))
        conversation_transcriber = ConversationTranscriber(speech_config, audio_input, language="it-IT")

        conversation_transcriber.transcribed.connect(addsentence)
        ###

        st.session_state.fullwav = pydub.AudioSegment.empty()
        with st.chat_message("assistant"):
            with st.spinner("Transcription in progress..."):
                stream_placeholder = st.expander("Transcription", icon="📝").empty()

        conversation_transcriber.start_transcribing_async()
        logger.info("Transcribing started!")
        st.session_state["recording"] = True

    try:
        audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
    except queue.Empty:
        time.sleep(0.1)
        logger.debug("No frame arrived.")
        continue

    stream_placeholder.markdown("## Transcription:\n\n" + "\\\n".join(st.session_state.r))

    for audio_frame in audio_frames:
        # Push the raw frame to the SDK stream and keep a pydub copy for the .wav dump
        st.session_state.stream.write(audio_frame.to_ndarray().tobytes())
        sound = pydub.AudioSegment(
            data=audio_frame.to_ndarray().tobytes(),
            sample_width=audio_frame.format.bytes,
            frame_rate=audio_frame.sample_rate,
            channels=len(audio_frame.layout.channels),
        )
        st.session_state.fullwav += sound

if st.session_state["recording"]:
    logger.info("stopped listening")
    wav_file_path = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
    st.session_state.fullwav.export(wav_file_path, format="wav")
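
One detail that probably explains why nothing was transcribed: by default, PushAudioInputStream assumes 16 kHz, 16-bit, mono PCM, while browser WebRTC frames typically arrive at 48 kHz stereo, so the SDK misreads the raw bytes written above. An alternative to resampling would be declaring the incoming format explicitly; a sketch, assuming 48 kHz stereo input (check audio_frame.sample_rate and audio_frame.layout at runtime):

    import azure.cognitiveservices.speech as speechsdk

    # Declare the raw format the browser actually delivers (48 kHz stereo assumed here)
    # instead of relying on the 16 kHz mono default
    fmt = speechsdk.audio.AudioStreamFormat(
        samples_per_second=48000, bits_per_sample=16, channels=2)
    push_stream = speechsdk.audio.PushAudioInputStream(stream_format=fmt)
    audio_config = speechsdk.audio.AudioConfig(stream=push_stream)

I haven't verified that the conversation transcriber accepts stereo input, so downmixing and resampling to 16 kHz mono before writing (as in the solution below) is the safer route.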

EDIT 28/3: I rolled back to PushAudioInputStream and was able to process an RTMP stream using ffmpeg:

    def transcribe_rmtp(self, rtmp_url: str) -> str:
        push_stream = PushAudioInputStream()
        audio_config = AudioConfig(stream=push_stream)
        transcriber = self.setup_transcriber(audio_config)
        transcriber.start_transcribing_async()

        # Decode the RTMP feed to 16 kHz mono s16le PCM, which matches the
        # default format expected by PushAudioInputStream
        ffmpeg_args = [
            "ffmpeg", "-i", rtmp_url, "-vn", "-ac", "1", "-ar", "16000",
            "-f", "s16le", "-fflags", "+genpts", "-bufsize", "512k",
            "-maxrate", "128k", "pipe:1"]
        ffmpeg_process = subprocess.Popen(ffmpeg_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

        try:
            while not self.done:
                if self.on_transcribed:
                    self.on_transcribed("\\\n".join(self.results))
                # Feed the decoded PCM into the SDK's push stream in small chunks
                chunk = ffmpeg_process.stdout.read(4096)
                if not chunk:
                    break
                push_stream.write(chunk)
                time.sleep(0.1)
        except Exception as e:
            logger.error("Error during RTMP streaming: %s", e)
        finally:
            push_stream.close()
            ffmpeg_process.kill()
            transcriber.stop_transcribing_async()

        return "\\\n".join(self.results)

Solution

  • I've found a working solution:

        def transcribe_webrtc(self, webrtc_ctx: WebRtcStreamerContext) -> str:
            push_stream = PushAudioInputStream()
            audio_config = AudioConfig(stream=push_stream)
            transcriber = self.setup_transcriber(audio_config)
            transcriber.start_transcribing_async()
            logger.info("Started WebRTC transcription")
    
            try:
                while webrtc_ctx.state.playing:
                    audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
                    if not audio_frames:
                        logger.debug("No audio frames received")
                        continue
    
                    frame = pydub.AudioSegment.empty()
                    for audio_frame in audio_frames:
                        sound = pydub.AudioSegment(
                            data=audio_frame.to_ndarray().tobytes(),
                            sample_width=audio_frame.format.bytes,
                            frame_rate=audio_frame.sample_rate,
                            channels=len(audio_frame.layout.channels),
                        )
                        frame += sound
    
                    if len(frame) > 0:
                        logger.debug(f"Processing audio frame of length {len(frame.raw_data)} bytes")
                        frame= frame.set_channels(1).set_frame_rate(16000)
                        push_stream.write(frame.raw_data)
    
                    if self.on_transcribed:
                        self.on_transcribed("\\\n".join(self.results))
                    time.sleep(0.1)
            except Exception as e:
                logger.error("Error during WebRTC streaming: %s", e)
            finally:
                # Same teardown as the RTMP version: release the stream and stop the transcriber
                push_stream.close()
                transcriber.stop_transcribing_async()

            return "\\\n".join(self.results)
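
    Wired up to the same webrtc_streamer call as in the question, usage looks roughly like this (transcriber_service, an instance of the class holding these methods, is a name I'm assuming):

        webrtc_ctx = webrtc_streamer(key="speech-to-text", mode=WebRtcMode.SENDONLY,
                                     media_stream_constraints={"video": False, "audio": True},
                                     audio_receiver_size=256)

        if webrtc_ctx.state.playing:
            # Blocks while the browser is streaming, then returns the full transcript
            transcript = transcriber_service.transcribe_webrtc(webrtc_ctx)
            st.markdown(transcript)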