audio-streaming ipywidgets azure-speech

How to use ipywebrtc to stream audio to Azure Speech?


I'm using Solara to build a web app with Python, and I can use ipywebrtc to capture audio from the client's browser. I can record the audio to a temporary file first and then pass that file to Azure Speech, but I need the audio to be streamed instead.
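
For context, the temporary-file flow that already works looks roughly like this (the file names, the ffmpeg conversion step, and the recognize_once call are illustrative assumptions; ipywebrtc records webm/opus, which the Speech SDK does not read directly):

from ipywebrtc import CameraStream, AudioRecorder
import azure.cognitiveservices.speech as speechsdk
import subprocess

camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)

# ... after the user stops recording in the browser ...
recorder.save('capture.webm')  # raw browser recording (webm/opus)
# convert to 16 kHz mono WAV so the SDK can read it from a file
subprocess.run(['ffmpeg', '-y', '-i', 'capture.webm', '-ar', '16000', '-ac', '1', 'capture.wav'], check=True)

speech_config = speechsdk.SpeechConfig(subscription="...", region="brazilsouth")
audio_config = speechsdk.audio.AudioConfig(filename='capture.wav')
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
print(speech_recognizer.recognize_once().text)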

At first, I tried this code:

from ipywebrtc import CameraStream, AudioRecorder
from azure.cognitiveservices.speech.audio import AudioInputStream
import azure.cognitiveservices.speech as speechsdk
...
speech_config = speechsdk.SpeechConfig(subscription="...", region="brazilsouth")

camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)
stream = AudioInputStream(recorder.audio)
audio_config = speechsdk.audio.AudioConfig(stream=stream)
conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config=speech_config,audio_config=audio_config)
...
Exception in thread Thread-22 (recognize_from_device):
Traceback (most recent call last):
  File "C:\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Python312\Lib\threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "L:\projects\testes\functions\ouvir_microfone.py", line 62, in recognize_from_device
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\audio.py", line 382, in __init__
    _call_hr_fn(fn=_sdk_lib.audio_config_create_audio_input_from_stream, *[ctypes.byref(handle), stream._handle])
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 61, in _call_hr_fn
    hr = fn(*args) if len(args) > 0 else fn()
         ^^^^^^^^^
ctypes.ArgumentError: argument 2: TypeError: Don't know how to convert parameter 2
Exception ignored in: <function _Handle.__del__ at 0x00000140E12B5E40>
Traceback (most recent call last):
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 105, in __del__
    elif self.__test_fn(self.__handle):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ctypes.ArgumentError: argument 1: TypeError: Don't know how to convert parameter 1

Changing AudioInputStream(recorder.audio) to AudioInputStream(recorder.codecs) only changes the error:

RuntimeError: Exception with error code:
[CALL STACK BEGIN]

    > GetModuleObject
    - audio_config_get_audio_processing_options
    - audio_config_create_audio_input_from_stream
    - ffi_prep_go_closure
    - ffi_call_go
    - ffi_call
    - 00007FF910133DD5 (SymFromAddr() error: Attempt to access invalid address.)
    - 00007FF910132D33 (SymFromAddr() error: Attempt to access invalid address.)
    - 00007FF910132928 (SymFromAddr() error: Attempt to access invalid address.)
    - PyObject_Call
    - PyEval_EvalFrameDefault
    - PyFunction_Vectorcall
    - PyObject_VectorcallMethod
    - PyObject_Vectorcall
    - PyObject_Vectorcall
    - PyEval_EvalFrameDefault

[CALL STACK END]

Exception with an error code: 0x5 (SPXERR_INVALID_ARG)
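
For what it's worth, the SDK's streaming input is normally built from a PushAudioInputStream that is fed raw PCM bytes, not by constructing AudioInputStream from a widget, so a working version would presumably have to look something like the sketch below (key/region and the source of the PCM chunks are placeholders):

import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription="...", region="brazilsouth")
# default push-stream format: 16 kHz, 16-bit, mono PCM
push_stream = speechsdk.audio.PushAudioInputStream()
audio_config = speechsdk.audio.AudioConfig(stream=push_stream)
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
recognizer.recognized.connect(lambda evt: print(evt.result.text))
recognizer.start_continuous_recognition()

# each time a chunk of decoded PCM audio arrives from the browser:
#     push_stream.write(pcm_bytes)
# and when the capture ends:
#     push_stream.close()
#     recognizer.stop_continuous_recognition()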

Solution

  • I could capture audio from the web browser using Solara, stream it to Azure Speech for continuous recognition, and also save it to a .wav file.

    Code:

    app.py:

    from flask import Flask, render_template, jsonify
    import azure.cognitiveservices.speech as speechsdk
    from threading import Timer
    import solara
    import pyaudio
    import wave
    import threading
    
    app = Flask(__name__)
    
    subscription_key = "<speech_key>"
    service_region = "<speech_region>"
    speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=service_region)
    audio_config = speechsdk.AudioConfig(use_default_microphone=True)
    
    chunks = []
    recognition_running = False
    recognizer = None
    stop_timer = None
    INACTIVITY_TIMEOUT = 5  # seconds without a recognized phrase before recognition stops
    
    # PyAudio settings for the copy of the audio saved to disk
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK = 1024
    WAVE_OUTPUT_FILENAME = "output.wav"
    
    audio = pyaudio.PyAudio()
    frames = []
    
    # Collect each final recognition result and reset the inactivity timer.
    def recognized_callback(evt):
        global chunks, stop_timer
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            chunks.append(evt.result.text)
            print(f"Recognized: {evt.result.text}")
            reset_stop_timer()
    
    # (Re)start the timer that stops recognition after INACTIVITY_TIMEOUT seconds of silence.
    def reset_stop_timer():
        global stop_timer
        if stop_timer:
            stop_timer.cancel()
        stop_timer = Timer(INACTIVITY_TIMEOUT, stop_recognition)
        stop_timer.start()
    
    # Start continuous recognition from the default microphone and begin saving audio in a background thread.
    def start_recognition():
        global recognition_running, recognizer, audio_stream, frames
        if not recognition_running:
            recognition_running = True
            recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
            recognizer.recognized.connect(recognized_callback)
            recognizer.start_continuous_recognition()
            print("Recognition started")
    
            frames = []
            audio_stream = audio.open(format=FORMAT, channels=CHANNELS,
                                      rate=RATE, input=True,
                                      frames_per_buffer=CHUNK)
            threading.Thread(target=record_audio).start()
    
    # Capture raw audio frames while recognition is running.
    def record_audio():
        global audio_stream, frames, recognition_running
        while recognition_running:
            data = audio_stream.read(CHUNK)
            frames.append(data)
    
    # Stop recognition and write the captured frames to a .wav file.
    def stop_recognition():
        global recognition_running, recognizer, audio_stream, frames
        if recognition_running:
            recognizer.stop_continuous_recognition()
            recognition_running = False
            print("Recognition stopped")
    
            audio_stream.stop_stream()
            audio_stream.close()
            wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
            wf.close()
            print(f"Audio saved as {WAVE_OUTPUT_FILENAME}")
    
    @app.route('/')
    def index():
        return render_template('index.html')
    
    @app.route('/start_recognition')
    def start_recognition_route():
        start_recognition()
        return jsonify({'status': 'started'})
    
    @solara.component
    def MainPage():
        return solara.Column(
            [
                solara.Button("Start Recognition", on_click=start_recognition),
                solara.Text(f"Recognized: {', '.join(chunks)}")
            ]
        )
    
    if __name__ == "__main__":
        app.run(debug=True)
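
    To exercise the Flask route without the page, a quick client call like this should also start recognition (the URL assumes Flask's default development server on port 5000):

    import requests

    # calls the /start_recognition route defined in app.py above
    resp = requests.get("http://127.0.0.1:5000/start_recognition")
    print(resp.json())  # expected: {'status': 'started'}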
    

    templates/index.html:

    <!DOCTYPE html>
    <html>
    <head>
        <title>Azure Speech Service with Flask</title>
    </head>
    <body>
        <h1>Speak into your microphone</h1>
        <button onclick="startRecognition()">Start Recognition</button>
        <pre id="recognized_text"></pre>
        <script>
            let recognitionStarted = false;
            function startRecognition() {
                fetch('/start_recognition')
                    .then(response => response.json())
                    .then(data => {
                        console.log('Recognition started');
                        recognitionStarted = true;
                    });
            }
            function handleRecognitionResults() {
            }
        </script>
    </body>
    </html>
    

    Output:

    I opened the output URL in the browser and got the page below. I clicked the Start Recognition button and spoke a few sentences.

    [screenshot: browser page with the Start Recognition button]

    VS Code terminal output:

    The audio from the web browser was saved to a .wav file, and the recognized text streamed to the VS Code terminal along with the audio, as shown below.

    [screenshot: recognized text streaming in the VS Code terminal]