With the code below, the result always ends up being the same: "Thank you." Any ideas what could be going wrong? For reference, I used the load_audio function from the whisper package, along with an article about Twilio and Vosk.
@sock.route('/stream')
def stream(ws):
    """Flask-Sock handler for a Twilio Media Stream.

    Decodes base64 'media' frames (Twilio sends 8 kHz mu-law audio) and
    hands the raw bytes to add_audio().  Exits when the peer closes the
    socket or the stream signals its end.
    """
    while True:
        message = ws.receive()
        if message is None:
            # Peer closed the socket -- the old code passed None to
            # json.loads() and crashed here.
            break
        packet = json.loads(message)
        if packet['event'] == 'media':
            # Payload is base64-encoded mu-law audio, NOT linear PCM.
            audio = base64.b64decode(packet['media']['payload'])
            add_audio(audio)
        elif packet['event'] in ('stop', 'closed'):
            # Twilio's documented end-of-stream event is 'stop'; the
            # original loop never terminated.
            break
# Rolling mono float32 sample buffer shared by add_audio()/process_audio();
# grows with every decoded media frame.
buffer = np.array([], dtype=np.float32)
def add_audio(audio):
    """Append one Twilio media frame to the shared sample buffer.

    Twilio Media Streams deliver 8 kHz, 8-bit mu-law audio, while Whisper
    expects 16 kHz linear PCM.  The original code reinterpreted the mu-law
    bytes directly as int16 PCM (and never resampled), which yields
    near-silent noise and makes Whisper hallucinate a constant
    "Thank you." -- decode and resample before normalizing.

    Parameters
    ----------
    audio : bytes
        Raw mu-law payload from one 'media' event.
    """
    import audioop  # stdlib; deprecated in 3.11, removed in 3.13

    global buffer
    # mu-law -> 16-bit linear PCM, then upsample 8 kHz -> 16 kHz.
    pcm = audioop.ulaw2lin(audio, 2)
    pcm, _ = audioop.ratecv(pcm, 2, 1, 8000, 16000, None)
    # Normalize to float32 in [-1, 1), matching whisper.load_audio().
    samples = np.frombuffer(pcm, np.int16).astype(np.float32) / 32768.0
    buffer = np.concatenate((buffer, samples))
    process_audio()
def process_audio():
    """Run Whisper over the accumulated sample buffer.

    Pads or trims the buffer to Whisper's fixed 30-second window, builds
    the log-mel spectrogram on the model's device, scrubs NaNs, and
    decodes with the module-level `model` and `options`.
    """
    global buffer
    clip = whisper.pad_or_trim(buffer)
    spectrogram = torch.nan_to_num(
        whisper.log_mel_spectrogram(clip).to(model.device)
    )
    # NOTE(review): `result` is unused in the visible snippet -- presumably
    # the full program prints or streams result.text; confirm upstream.
    result = whisper.decode(model, spectrogram, options)
Andreas' answer below is absolutely correct. Unfortunately, it was downvoted because it is not in English. Here is an updated code snippet that consumes a Twilio Media Stream and generates transcriptions with the OpenAI API.
# Accumulated 16 kHz linear-PCM bytes not yet submitted for transcription.
audio_data2 = b''
# Most recent transcription, used to suppress duplicate prints.
transcription0 = ''
# Running media-frame counter; also names the temporary WAV files.
i = 0
@app.websocket("/v1/twilio/stream")
async def websocket_endpoint(websocket: WebSocket):
    """Consume a Twilio Media Stream and transcribe it with OpenAI whisper-1.

    Buffers mu-law frames (decoded and upsampled to 16 kHz linear PCM)
    until roughly 300 KB accumulate, writes them to a temporary WAV file,
    sends that file to the transcription API, and prints any new text.
    """
    import base64
    import json
    import audioop  # NOTE(review): removed from the stdlib in Python 3.13
    import wave
    import os

    from openai import OpenAI
    from os import environ as env

    client = OpenAI(api_key=env["OPENAI_API_KEY"])

    global audio_data2
    global transcription0
    global i

    await websocket.accept()
    message_count = 0
    print("Connected to WebSocket")
    try:
        while True:
            # receive_text() always yields a str (or raises), so the old
            # None/isinstance guards were dead code.
            message = await websocket.receive_text()
            data = json.loads(message)

            event = data.get('event')
            if event == "connected":
                print("Connected Message received: {}".format(message))
            elif event == "start":
                print("Start Message received: {}".format(message))
            elif event in ("stop", "closed"):
                # Twilio's documented end-of-stream event is "stop".
                print("Closed Message received: {}".format(message))
                break
            elif event == "media":
                i = i + 1
                payload = data['media']['payload']
                audio_data = base64.b64decode(payload)
                # mu-law -> 16-bit linear PCM, then 8 kHz -> 16 kHz.
                audio_data = audioop.ulaw2lin(audio_data, 2)
                audio_data = audioop.ratecv(audio_data, 2, 1, 8000, 16000, None)[0]
                audio_data2 = audio_data2 + audio_data
                if len(audio_data2) > 299999:
                    sondosiero = 'sono' + str(i) + '.wav'
                    # `with` closes the writer; the explicit close() the
                    # original added inside the with-block was redundant.
                    with wave.open(sondosiero, 'w') as wavfile:
                        wavfile.setnchannels(1)
                        wavfile.setsampwidth(2)
                        wavfile.setframerate(16000)
                        wavfile.writeframes(audio_data2)
                    try:
                        # with-block guarantees the handle is closed even on
                        # failure; the original leaked it into os.remove(),
                        # which fails on Windows for open files.
                        with open(sondosiero, "rb") as audio_file3:
                            transcription = client.audio.transcriptions.create(
                                model="whisper-1", file=audio_file3
                            )
                        # Compare the TEXT, not the response object: a fresh
                        # object never equals the stored one, so the original
                        # dedup check always passed.
                        if transcription.text != transcription0:
                            print(transcription.text + ' ')
                            transcription0 = transcription.text
                    except Exception as exc:
                        # Best-effort: keep streaming if one chunk fails, but
                        # surface the reason instead of a bare `except: pass`.
                        print(f"Transcription failed: {exc}")
                    os.remove(sondosiero)
                    audio_data2 = b''
            message_count += 1
    except WebSocketDisconnect:
        print(f"Connection closed. Transcription is {transcription0}. Received a total of {message_count} messages")
If you use this in production, you will also need to handle the final portion of audio that never reaches the len(audio_data2) threshold and would otherwise be left untranscribed.