I'm using Solara to build a web app with Python, and I can use ipywebrtc to capture audio from the client's browser. I can record the audio to a temporary file first and then pass that file to Azure Speech, but I need the audio to be streamed to the service instead.
At first, I tried this code:
from ipywebrtc import CameraStream, AudioRecorder
from azure.cognitiveservices.speech.audio import AudioInputStream
import azure.cognitiveservices.speech as speechsdk
...
speech_config = speechsdk.SpeechConfig(subscription="...", region="brazilsouth")
camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)
stream = AudioInputStream(recorder.audio)
audio_config = speechsdk.audio.AudioConfig(stream=stream)
conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config=speech_config,audio_config=audio_config)
...
Exception in thread Thread-22 (recognize_from_device):
Traceback (most recent call last):
  File "C:\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Python312\Lib\threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "L:\projects\testes\functions\ouvir_microfone.py", line 62, in recognize_from_device
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\audio.py", line 382, in __init__
    _call_hr_fn(fn=_sdk_lib.audio_config_create_audio_input_from_stream, *[ctypes.byref(handle), stream._handle])
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 61, in _call_hr_fn
    hr = fn(*args) if len(args) > 0 else fn()
         ^^^^^^^^^
ctypes.ArgumentError: argument 2: TypeError: Don't know how to convert parameter 2
Exception ignored in: <function _Handle.__del__ at 0x00000140E12B5E40>
Traceback (most recent call last):
  File "L:\projects\testes\venv\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 105, in __del__
    elif self.__test_fn(self.__handle):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ctypes.ArgumentError: argument 1: TypeError: Don't know how to convert parameter 1
Changing AudioInputStream(recorder.audio) to AudioInputStream(recorder.codecs) produces a different error:
RuntimeError: Exception with error code:
[CALL STACK BEGIN]
> GetModuleObject
- audio_config_get_audio_processing_options
- audio_config_create_audio_input_from_stream
- ffi_prep_go_closure
- ffi_call_go
- ffi_call
- 00007FF910133DD5 (SymFromAddr() error: Attempt to access invalid address.)
- 00007FF910132D33 (SymFromAddr() error: Attempt to access invalid address.)
- 00007FF910132928 (SymFromAddr() error: Attempt to access invalid address.)
- PyObject_Call
- PyEval_EvalFrameDefault
- PyFunction_Vectorcall
- PyObject_VectorcallMethod
- PyObject_Vectorcall
- PyObject_Vectorcall
- PyEval_EvalFrameDefault
[CALL STACK END]
Exception with an error code: 0x5 (SPXERR_INVALID_ARG)
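For reference, the pattern I understand the Speech SDK expects for feeding audio programmatically is a PushAudioInputStream that raw PCM bytes are written into, rather than passing the recorder widget itself. A minimal sketch of that pattern (assuming 16 kHz, 16-bit, mono PCM; pcm_chunks is a placeholder, since the ipywebrtc recorder produces encoded audio such as webm that would first need decoding to raw PCM):

import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription="...", region="brazilsouth")

# PushAudioInputStream is the concrete stream type meant for writing bytes;
# AudioInputStream itself is only the abstract base class.
stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=16000,
                                                  bits_per_sample=16,
                                                  channels=1)
push_stream = speechsdk.audio.PushAudioInputStream(stream_format=stream_format)
audio_config = speechsdk.audio.AudioConfig(stream=push_stream)

conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
    speech_config=speech_config, audio_config=audio_config)

def feed_audio(pcm_chunks):
    # pcm_chunks: any iterable of raw PCM byte buffers (placeholder name)
    for chunk in pcm_chunks:
        push_stream.write(chunk)
    push_stream.close()  # signals end of stream to the recognizer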
I was able to capture audio from the web browser using Solara, stream it to Azure Speech for continuous recognition, and save it to a .wav file.
Code :
app.py :
from flask import Flask, render_template, jsonify
import azure.cognitiveservices.speech as speechsdk
from threading import Timer
import solara
import pyaudio
import wave
import threading

app = Flask(__name__)

subscription_key = "<speech_key>"
service_region = "<speech_region>"
speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=service_region)
audio_config = speechsdk.AudioConfig(use_default_microphone=True)

chunks = []
recognition_running = False
recognizer = None
stop_timer = None
INACTIVITY_TIMEOUT = 5

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
WAVE_OUTPUT_FILENAME = "output.wav"

audio = pyaudio.PyAudio()
frames = []

def recognized_callback(evt):
    global chunks, stop_timer
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        chunks.append(evt.result.text)
        print(f"Recognized: {evt.result.text}")
        reset_stop_timer()

def reset_stop_timer():
    global stop_timer
    if stop_timer:
        stop_timer.cancel()
    stop_timer = Timer(INACTIVITY_TIMEOUT, stop_recognition)
    stop_timer.start()

def start_recognition():
    global recognition_running, recognizer, audio_stream, frames
    if not recognition_running:
        recognition_running = True
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        recognizer.recognized.connect(recognized_callback)
        recognizer.start_continuous_recognition()
        print("Recognition started")
        frames = []
        audio_stream = audio.open(format=FORMAT, channels=CHANNELS,
                                  rate=RATE, input=True,
                                  frames_per_buffer=CHUNK)
        threading.Thread(target=record_audio).start()

def record_audio():
    global audio_stream, frames, recognition_running
    while recognition_running:
        data = audio_stream.read(CHUNK)
        frames.append(data)

def stop_recognition():
    global recognition_running, recognizer, audio_stream, frames
    if recognition_running:
        recognizer.stop_continuous_recognition()
        recognition_running = False
        print("Recognition stopped")
        audio_stream.stop_stream()
        audio_stream.close()
        wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
        wf.close()
        print(f"Audio saved as {WAVE_OUTPUT_FILENAME}")

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/start_recognition')
def start_recognition_route():
    start_recognition()
    return jsonify({'status': 'started'})

@solara.component
def MainPage():
    return solara.Column(
        [
            solara.Button("Start Recognition", on_click=start_recognition),
            solara.Text(f"Recognized: {', '.join(chunks)}")
        ]
    )

if __name__ == "__main__":
    app.run(debug=True)
templates/index.html :
<!DOCTYPE html>
<html>
<head>
    <title>Azure Speech Service with Flask</title>
</head>
<body>
    <h1>Speak into your microphone</h1>
    <button onclick="startRecognition()">Start Recognition</button>
    <pre id="recognized_text"></pre>

    <script>
        let recognitionStarted = false;

        function startRecognition() {
            fetch('/start_recognition')
                .then(response => response.json())
                .then(data => {
                    console.log('Recognition started');
                    recognitionStarted = true;
                });
        }

        function handleRecognitionResults() {
        }
    </script>
</body>
</html>
Output :
I opened the app URL in the browser, clicked the Start Recognition button, and spoke a few sentences.
VS Code terminal output :
The audio from the web browser was saved to a .wav file, and the recognized text was streamed to the VS Code terminal while I spoke.
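One note on the sample above: the recognized text only appears in the terminal because handleRecognitionResults() in index.html is left empty. If you want to surface it in the page as well, one option is a small polling endpoint that returns the accumulated chunks; the /get_results route below is an addition, not part of the original code, and the page's JavaScript would need to fetch it periodically and write the result into the recognized_text element.

@app.route('/get_results')
def get_results():
    # Returns everything recognized so far; the browser can poll this
    # (e.g. with setInterval + fetch) and fill the <pre id="recognized_text">.
    return jsonify({'recognized': chunks})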