I have been working with the Realtime API and Twilio for python example mentioned here: https://www.twilio.com/en-us/blog/voice-ai-assistant-openai-realtime-api-python
I basically want to store an mp3 of the recording, so I have extended my twiml creation to:
response = VoiceResponse()
response.say("Please wait while we connect your call to Pony")
response.pause(length=5)
response.say("O.K. you can start talking!")
host = request.url.hostname
start = Start()
start.stream(url=f'wss://{host}/store-stream', track='both_tracks')
response.append(start)
connect = Connect()
connect.stream(url=f'wss://{host}/media-stream')
response.append(connect)
return HTMLResponse(content=str(response), media_type="application/xml")
With the second stream, I'll be able to receive both the tracks (user and Realtime API) and create a .wav or .mp3 file.
I'm having the following code, but the mp3 is of very low quality and in slow motion.
@app.websocket("/store-stream")
async def handle_store_stream(ws: WebSocket):
await ws.accept()
pcm_data = bytearray()
try:
async for message in ws.iter_text():
data = json.loads(message)
event = data['event']
if event == "media":
# Decode the Base64 payload to raw G.711 µ-law bytes
payload = data["media"]["payload"]
ulaw_chunk = base64.b64decode(payload)
# Convert µ-law to linear PCM (16-bit)
pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2)
pcm_data.extend(pcm_chunk)
if event == "stop":
audio_segment = AudioSegment(
data=bytes(pcm_data),
sample_width=2, # 16-bit audio
frame_rate=8000, # G.711 µ-law is usually sampled at 8 kHz
channels=1 # mono audio
)
audio_segment.export('audio.mp3', format="mp3")
break
I have tried many different settings, but nothing seems to improve the result of the output mp3.
This store-stream receives both inbound and outbound tracks separately, so you should store them separately and then overlay them if you want the "complete" recording.
Something like this:
async def handle_messages(self):
"""Handle messages from Twilio WebSocket."""
try:
async for message in self.twilio_ws.iter_text():
event = json.loads(message)
event_type = event["event"]
if event_type == "start":
self.stream_id = event["start"]["streamSid"]
self.call_id = event["start"]["callSid"]
self.account_id = event["start"]["accountSid"]
if event_type == "media":
# Decode the Base64 payload to raw G.711 µ-law bytes
payload = event["media"]["payload"]
ulaw_chunk = base64.b64decode(payload)
# Convert µ-law to linear PCM (16-bit)
pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2) # 2 means 16-bit PCM
self.pcm_data[event["media"]["track"]].extend(pcm_chunk)
if event_type == "stop":
self.export_audio()
break
def export_audio(self):
"""Export the recorded audio to an MP3 file."""
try:
audio_segments = {}
for track in self.pcm_data:
audio_segment = AudioSegment(
data=bytes(self.pcm_data[track]),
sample_width=2, # 16-bit audio
frame_rate=8000, # G.711 µ-law is usually sampled at 8 kHz
channels=1, # mono audio
)
audio_segments[track] = audio_segment
combined_audio = audio_segments["inbound"].overlay(
audio_segments["outbound"]
)
combined_audio.export(
os.path.join(self.export_path, f"{self.call_id}_twilio.mp3"),
format="mp3",
)