For a class, I am trying to stream audio from a ChatGPT API response. The code below mostly works, and I am getting good quality when I play the saved file later, but if I try to do it live it is super choppy:
# Running total of every decoded chunk, exported as one file at the end
full_audio = AudioSegment.empty()

stream_completion = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "pcm16"},
    messages=[
        {
            "role": "user",
            "content": "Can you tell me a funny short story about a pickle?",
        }
    ],
    stream=True,
)

# Handle the chunks in arrival order: play each one, then append it
for chunk in stream_completion:
    chunk_audio = getattr(chunk.choices[0].delta, 'audio', None)
    if chunk_audio is None:
        continue  # this chunk carries no audio payload

    pcm_bytes = base64.b64decode(chunk_audio.get('data', ''))
    if not pcm_bytes:
        continue  # audio entry present but without any data

    # Wrap the raw samples: 16-bit PCM, 24 kHz, mono
    audio_segment = AudioSegment.from_raw(
        io.BytesIO(pcm_bytes),
        sample_width=2,
        frame_rate=24000,
        channels=1,
    )
    play(audio_segment)
    full_audio += audio_segment

# Write everything that was received to a single WAV file
full_audio.export("assets/audio/full_audio.wav", format="wav")
Any idea of how to smooth the audio out while I play it live via stream?
Without seeing the imports you are using, it's not clear exactly how this behaves, but you most likely need to retrieve the audio in another thread or process. Otherwise, the time it takes to fetch each block is added as a gap between consecutive blocks of audio playback!
import threading
from queue import Empty, Queue
def fetcher(queue, exit_event, blocks=()):
    """Push audio blocks onto *queue*, then signal that fetching is done.

    Args:
        queue: Queue shared with the playback side; each block is put on it.
        exit_event: Event that is set once no more blocks will be queued.
        blocks: Iterable yielding the blocks to enqueue. A ``None`` item is
            treated as an end-of-stream marker. Defaults to no blocks.
            TODO: pass the real block source here.
    """
    try:
        for data in blocks:
            if data is None:  # end-of-stream marker: stop fetching
                break
            queue.put(data)
    finally:
        # Set the event even if the source raises, so a consumer waiting on
        # it is not left looping forever.
        exit_event.set()  # all blocks retrieved, begin exiting
def playback(queue, exit_event, retry_wait_seconds=0.1):
    """Play queued PCM blocks until fetching is finished and the queue is empty.

    Args:
        queue: Queue of raw PCM byte blocks to play.
        exit_event: Event that signals no further blocks will be queued.
        retry_wait_seconds: How long to wait for a block before re-checking
            the exit condition.
    """
    # Stop only when BOTH hold: fetching is finished and nothing is left to
    # play. Stopping on either one alone would quit before the first block
    # arrives, or drop blocks that are still queued.
    while not (exit_event.is_set() and queue.empty()):
        try:  # NOTE a fast network should keep this buffer filled
            data = queue.get(timeout=retry_wait_seconds)
        except Empty:  # the ``queue`` parameter shadows the module name
            continue  # next data block or exiting
        # might be better to put this into the fetcher too and just play
        audio_segment = AudioSegment.from_raw(
            io.BytesIO(data),
            sample_width=2,  # 16-bit PCM
            frame_rate=24000,  # 24kHz sample rate
            channels=1,  # Mono audio
        )
        play(audio_segment)
def main():
    """Run the fetcher and the playback loop in two threads and wait for both."""
    block_queue = Queue()
    done_event = threading.Event()
    threads = [
        threading.Thread(target=fetcher, args=(block_queue, done_event)),
        threading.Thread(target=playback, args=(block_queue, done_event)),
    ]
    # A thread must be started before it can be joined; joining an unstarted
    # thread raises RuntimeError.
    for t in threads:
        t.start()
    for t in threads:
        t.join()
If this still results in a playback gap, it's likely that your play function is broken, or that it re-initializes playback on every call.
Alternatively, wait a little longer up front, collect the whole response, and play it all in a single call:
# Play the complete audio with a single call instead of one call per chunk
play(full_audio)