I’m working on an RTS (real-time streaming) task as part of a project.
Here’s the flow:
I’m using WebSocket + PyAV to mux the audio and video streams together into a buffer (instead of writing to disk).
I created the container like this:
self.mux_buffer = io.BytesIO()
self.mux_container = av.open(
    self.mux_buffer,
    mode="w",
    format="mp4",
    options={
        "movflags": "frag_keyframe+empty_moov+default_base_moof"
    }
)
Through logs, I can confirm that nothing is being written to the buffer. Is there a fix for this?
Or is there a better approach for streaming real-time audio chunks + video frames (in sync)?
You’re opening the muxer but never encoding and muxing any packets. MP4 writes nothing until the first packet arrives (PyAV emits the header on the first mux call). Add streams, encode your frames/chunks, and mux the returned packets; keep the fMP4 movflags so each fragment is flushed as soon as a keyframe appears.
import io
from fractions import Fraction

import av
import numpy as np

W, H, FPS, SR, STEREO = 640, 360, 25, 48000, 2
SAMPLES_PER_FRAME = SR // FPS  # 48000 / 25 = 1920

buf = io.BytesIO()
oc = av.open(
    buf, mode="w", format="mp4",
    options={
        "movflags": "empty_moov+frag_keyframe+default_base_moof",
        "flush_packets": "1",  # flush fragments promptly
    },
)
# VIDEO stream
v = oc.add_stream("libx264", rate=FPS)  # or "h264" depending on your build
v.width, v.height = W, H
v.pix_fmt = "yuv420p"
v.time_base = Fraction(1, FPS)
v.codec_context.options = {
    "tune": "zerolatency",
    "preset": "veryfast",
    "g": str(FPS),           # 1 keyframe/sec => one fMP4 fragment/sec
    "keyint_min": str(FPS),
    "sc_threshold": "0",
}

# AUDIO stream
a = oc.add_stream("aac", rate=SR)
a.layout = "stereo"
a.time_base = Fraction(1, SR)
# Produce a few frames to prove bytes get written
video_pts = 0
audio_pts = 0
for i in range(60):  # ~2.4 s at 25 fps
    # fake video frame: solid gray whose brightness ramps over time
    rgb = np.full((H, W, 3), i % 255, dtype=np.uint8)
    vf = av.VideoFrame.from_ndarray(rgb, format="rgb24").reformat(format="yuv420p")
    vf.pts = video_pts
    video_pts += 1
    for pkt in v.encode(vf):
        oc.mux(pkt)

    # fake audio chunk (silence); packed "s16" is interleaved,
    # so from_ndarray expects shape (1, samples * channels)
    samples = np.zeros((1, SAMPLES_PER_FRAME * STEREO), dtype=np.int16)
    af = av.AudioFrame.from_ndarray(samples, format="s16", layout="stereo")
    af.sample_rate = SR
    af.pts = audio_pts
    audio_pts += SAMPLES_PER_FRAME
    for pkt in a.encode(af):
        oc.mux(pkt)
# flush encoders so the last fragment is written
for pkt in v.encode(None):
    oc.mux(pkt)
for pkt in a.encode(None):
    oc.mux(pkt)

oc.close()
print("bytes written:", len(buf.getvalue()))
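One more note for the real-time side of your question: instead of polling the BytesIO, you can hand av.open any file-like object, so each fragment gets pushed over the WebSocket the moment the muxer writes it. A minimal sketch of that idea, assuming your WebSocket library exposes a synchronous send callback (the send_bytes name here is hypothetical, not part of any library):

class FragmentSink:
    # File-like sink for av.open: the muxer calls write() once per chunk
    # (init segment first, then each moof+mdat fragment).
    def __init__(self, send_bytes):
        self.send_bytes = send_bytes  # hypothetical: your WebSocket send callback

    def write(self, data):
        self.send_bytes(bytes(data))  # push immediately instead of accumulating
        return len(data)

# Same muxing code as above, just a different write target:
# oc = av.open(FragmentSink(ws.send_bytes), mode="w", format="mp4",
#              options={"movflags": "empty_moov+frag_keyframe+default_base_moof"})

This works because empty_moov+default_base_moof keeps the muxer from seeking backwards to patch the header, so a write-only sink is enough. As for sync: PyAV muxes through FFmpeg’s interleaving layer, so as long as each audio and video frame carries a correct pts in its stream’s time base, the fragments come out properly interleaved.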