Tags: stream, twilio, media, twilio-twiml, realtime-api

Store mp3 from media-stream


I have been working with the Python example for Twilio and the OpenAI Realtime API described here: https://www.twilio.com/en-us/blog/voice-ai-assistant-openai-realtime-api-python

I want to store an MP3 recording of the call, so I have extended my TwiML generation to:

    # Build the TwiML document that is returned for the call.
    response = VoiceResponse()
    response.say("Please wait while we connect your call to Pony")
    response.pause(length=5)
    response.say("O.K. you can start talking!")
    # Both WebSocket URLs reuse the hostname of the current request.
    host = request.url.hostname
    # Additional stream to /store-stream, requesting both tracks so the
    # audio of the call can be written to a file.
    start = Start()
    start.stream(url=f'wss://{host}/store-stream', track='both_tracks')
    response.append(start)
    # Stream to /media-stream (presumably the Realtime API bridge from the
    # linked example; the handler is not shown here).
    connect = Connect()
    connect.stream(url=f'wss://{host}/media-stream')
    response.append(connect)
    # Serialize the TwiML and serve it with an XML media type.
    return HTMLResponse(content=str(response), media_type="application/xml")

With the additional stream (`/store-stream`), I should be able to receive both tracks (the user and the Realtime API) and create a .wav or .mp3 file from them.

I have the following code, but the resulting MP3 is of very low quality and plays in slow motion.

@app.websocket("/store-stream")
async def handle_store_stream(ws: WebSocket):
    """Collect audio from the /store-stream WebSocket and export it as an MP3.

    For every "media" event the Base64 payload is decoded, converted from
    µ-law to 16-bit linear PCM and appended to one shared buffer. On the
    "stop" event the buffer is exported to ``audio.mp3`` and the loop ends.

    NOTE(review): the track of a media message is never inspected, so when
    the stream is started with ``track='both_tracks'`` the chunks of both
    tracks end up in the same mono buffer. This is the likely cause of the
    slow, distorted output; confirm against the Twilio media message format.
    """
    await ws.accept()
    # Single accumulator for all decoded PCM, regardless of track.
    pcm_data = bytearray()
    # NOTE(review): no except/finally clause for this try is shown in the snippet.
    try:
        async for message in ws.iter_text():
            data = json.loads(message)
            event = data['event']
            if event == "media":
                # Decode the Base64 payload to raw G.711 µ-law bytes
                payload = data["media"]["payload"]
                ulaw_chunk = base64.b64decode(payload)
                
                # Convert µ-law to linear PCM (16-bit)
                pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2)
                
                pcm_data.extend(pcm_chunk)
            if event == "stop":
                # Wrap everything received so far as 8 kHz, 16-bit, mono audio.
                audio_segment = AudioSegment(
                    data=bytes(pcm_data),
                    sample_width=2,  # 16-bit audio
                    frame_rate=8000,  # G.711 µ-law is usually sampled at 8 kHz
                    channels=1  # mono audio
                )
                # Written to a fixed file name relative to the working directory.
                audio_segment.export('audio.mp3', format="mp3")
                break

I have tried many different settings, but nothing seems to improve the quality of the output MP3.


Solution

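  • This store-stream receives the inbound and outbound tracks as separate media messages. Appending the chunks of both tracks to a single buffer interleaves two different signals and roughly doubles the number of samples, which is why the result sounds slow and distorted. Store the tracks separately and then overlay them if you want the "complete" recording.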

    Something like this:

        async def handle_messages(self):
            """Handle messages from the Twilio WebSocket.

            "start" events record the stream, call and account identifiers.
            "media" events are decoded from Base64 µ-law to 16-bit linear PCM
            and appended to the buffer of the track named in the message.
            A "stop" event triggers ``export_audio()`` and ends the loop.

            NOTE(review): assumes ``self.pcm_data`` already holds one
            extendable buffer per track name (a missing key would raise
            KeyError) -- confirm where it is initialised.
            """
            # NOTE(review): no except/finally clause for this try is shown in the snippet.
            try:
                async for message in self.twilio_ws.iter_text():
                    event = json.loads(message)
                    event_type = event["event"]
                    if event_type == "start":
                        # Keep the identifiers; call_id is used for the export file name.
                        self.stream_id = event["start"]["streamSid"]
                        self.call_id = event["start"]["callSid"]
                        self.account_id = event["start"]["accountSid"]
                    if event_type == "media":
                        # Decode the Base64 payload to raw G.711 µ-law bytes
                        payload = event["media"]["payload"]
                        ulaw_chunk = base64.b64decode(payload)
    
                        # Convert µ-law to linear PCM (16-bit)
                        pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2)  # 2 means 16-bit PCM
    
                        # Keep each track in its own buffer so the tracks are not mixed.
                        self.pcm_data[event["media"]["track"]].extend(pcm_chunk)
                    if event_type == "stop":
                        self.export_audio()
                        break
    
    
        def export_audio(self):
            """Export the recorded audio to an MP3 file."""
            try:
                audio_segments = {}
                for track in self.pcm_data:
                    audio_segment = AudioSegment(
                        data=bytes(self.pcm_data[track]),
                        sample_width=2,  # 16-bit audio
                        frame_rate=8000,  # G.711 µ-law is usually sampled at 8 kHz
                        channels=1,  # mono audio
                    )
                    audio_segments[track] = audio_segment
                combined_audio = audio_segments["inbound"].overlay(
                    audio_segments["outbound"]
                )
                combined_audio.export(
                    os.path.join(self.export_path, f"{self.call_id}_twilio.mp3"),
                    format="mp3",
                )