cffmpegtcprtspaac

ERROR "Application provided invalid, non monotonically increasing dts to muxer in stream 1: 6874 >= 6874" while writing encoded output to an mp4 file


I have a running RTSP stream, streaming video on a loop using the following FFMPEG command:

ffmpeg -re -stream_loop -1 -i ./ffmpeg_c_test/small_bunny_1080p_60fps.mp4 -ac 2 -f rtsp -rtsp_transport tcp rtsp://localhost:8554/mystream

The video file is obtained from the github link: https://github.com/leandromoreira/ffmpeg-libav-tutorial

I keep getting the error response when I call the function av_interleaved_write_frame (invoked from the function remux in the attached program). The output format is mp4, the output video codec is AV1, and the output audio codec is the same as the input audio codec. The error comes from the audio stream.

I tried to create a "minimal reproducible code", however, I think it is still not completely minimal, but it reproduces the exact error.

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/timestamp.h>
#include <libavutil/opt.h>
#include <libswscale/swscale.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>

#include <string.h>
#include <inttypes.h>

/* Bundles everything needed to read from or write to one media container:
 * the format context, the chosen codecs, the streams and their codec
 * contexts, plus the stream indices for video and audio. One instance is
 * used for the input (decoder) and one for the output (encoder). */
typedef struct StreamingContext{
    AVFormatContext* avfc;
    const AVCodec *video_avc;    /* video codec (decoder or encoder) */
    const AVCodec *audio_avc;    /* audio codec */
    AVStream *video_avs;         /* video stream inside avfc */
    AVStream *audio_avs;         /* audio stream inside avfc */
    AVCodecContext *video_avcc;  /* opened video codec context */
    AVCodecContext *audio_avcc;  /* opened audio codec context */
    int video_index;             /* index of the video stream in avfc */
    int audio_index;             /* index of the audio stream in avfc */
    char* filename;              /* input URL or output file name (not owned) */
    struct SwsContext *sws_ctx;  /* scaler context; unused in this sample */
}StreamingContext;


/* User-selectable transcoding options: copy-vs-transcode flags, codec names,
 * and optional muxer / codec-private key-value options. */
typedef struct StreamingParams{
    char copy_video;             /* nonzero: remux video instead of transcoding */
    char copy_audio;             /* nonzero: remux audio instead of transcoding */
    char *output_extension;      /* optional extension appended to the output name */
    char *muxer_opt_key;
    char *muxer_opt_value;
    char *video_codec;           /* encoder name, e.g. "libsvtav1" */
    char *audio_codec;
    char *codec_priv_key;        /* optional private option for the video encoder */
    char *codec_priv_value;
}StreamingParams;

/* Print a printf-style message to stderr, prefixed with "LOG: " and
 * terminated with a newline. */
void logging(const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    fprintf(stderr, "LOG: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    va_end(ap);
}

/* Find, allocate and open a decoder for the given stream.
 * On success *avc and *avcc are filled in and 0 is returned; on any failure
 * -1 is returned (previously the NULL results of avcodec_find_decoder /
 * avcodec_alloc_context3 and the return of avcodec_open2 were ignored). */
int fill_stream_info(AVStream *avs, const AVCodec **avc, AVCodecContext **avcc)
{
    *avc = avcodec_find_decoder(avs->codecpar->codec_id);
    if (!*avc)
    {
        logging("Failed to find the Codec.");
        return -1;
    }

    *avcc = avcodec_alloc_context3(*avc);
    if (!*avcc)
    {
        logging("Failed to allocate the Codec Context.");
        return -1;
    }

    if (avcodec_parameters_to_context(*avcc, avs->codecpar) < 0)
    {
        logging("Failed to fill Codec Context.");
        return -1;
    }

    if (avcodec_open2(*avcc, *avc, NULL) < 0)
    {
        logging("Failed to open the Codec.");
        return -1;
    }
    return 0;
}

/* Open the input (file or RTSP URL) and probe its streams.
 * Returns 0 on success, -1 on failure.
 * BUG FIX: the original fell off the end of this non-void function on the
 * success path (undefined behavior if the caller reads the return value);
 * it also never checked the avformat_alloc_context() result. */
int open_media(const char *in_filename, AVFormatContext **avfc)
{
    *avfc = avformat_alloc_context();
    if (!*avfc)
    {
        logging("Failed to allocate format context.");
        return -1;
    }

    if (avformat_open_input(avfc, in_filename, NULL, NULL) != 0)
    {
        logging("Failed to open input file %s", in_filename);
        return -1;
    }

    if (avformat_find_stream_info(*avfc, NULL) < 0)
    {
        logging("Failed to get Stream Info.");
        return -1;
    }

    return 0;
}

/* Walk every stream of the opened input and set up a decoder for the video
 * and the audio stream (if several streams of one type exist, the last one
 * found wins). Returns 0 on success, -1 if a decoder cannot be opened. */
int prepare_decoder(StreamingContext *sc)
{
    for (int i = 0; i < (int)sc->avfc->nb_streams; i++)
    {
        if (sc->avfc->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
        {
            sc->video_avs = sc->avfc->streams[i];
            sc->video_index = i;

            if (fill_stream_info(sc->video_avs, &sc->video_avc, &sc->video_avcc))
            {
                return -1;
            }
        }
        else if (sc->avfc->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
        {
            sc->audio_avs = sc->avfc->streams[i];
            sc->audio_index = i;

            if (fill_stream_info(sc->audio_avs, &sc->audio_avc, &sc->audio_avcc))
            {
                return -1;
            }
        }
        else
        {
            /* Subtitle/data streams are simply ignored. */
            logging("Skipping Streams other than Audio and Video.");
        }
    }
    return 0;
}

/* Create the output video stream and configure + open the video encoder
 * selected by sp.video_codec. The encoder (and stream) time base is set to
 * 1/framerate so one frame advances the clock by one tick.
 * Returns 0 on success, -1 on failure. */
int prepare_video_encoder(StreamingContext *encoder_sc, AVCodecContext *decoder_ctx, AVRational input_framerate,
                          StreamingParams sp, int scaled_frame_width, int scaled_frame_height)
{
    encoder_sc->video_avs = avformat_new_stream(encoder_sc->avfc, NULL);
    encoder_sc->video_avc = avcodec_find_encoder_by_name(sp.video_codec);
    if (!encoder_sc->video_avc)
    {
        logging("Cannot find the Codec.");
        return -1;
    }

    encoder_sc->video_avcc = avcodec_alloc_context3(encoder_sc->video_avc);
    if (!encoder_sc->video_avcc)
    {
        logging("Could not allocate memory for Codec Context.");
        return -1;
    }

    /* NOTE(review): "fast" is an x264-style preset value; libsvtav1 expects a
     * numeric preset, so this av_opt_set may fail silently — verify per encoder. */
    av_opt_set(encoder_sc->video_avcc->priv_data, "preset", "fast", 0);
    if (sp.codec_priv_key && sp.codec_priv_value)
        av_opt_set(encoder_sc->video_avcc->priv_data, sp.codec_priv_key, sp.codec_priv_value, 0);

    encoder_sc->video_avcc->height = scaled_frame_height;
    encoder_sc->video_avcc->width = scaled_frame_width;
    encoder_sc->video_avcc->sample_aspect_ratio = decoder_ctx->sample_aspect_ratio;

    /* Prefer the encoder's first supported pixel format; otherwise keep the
     * decoder's format. */
    if (encoder_sc->video_avc->pix_fmts)
        encoder_sc->video_avcc->pix_fmt = encoder_sc->video_avc->pix_fmts[0];
    else
        encoder_sc->video_avcc->pix_fmt = decoder_ctx->pix_fmt;

    encoder_sc->video_avcc->bit_rate = 2 * 1000 * 1000; /* ~2 Mb/s target */

    encoder_sc->video_avcc->time_base = av_inv_q(input_framerate);
    encoder_sc->video_avs->time_base = encoder_sc->video_avcc->time_base;

    

    if (avcodec_open2(encoder_sc->video_avcc, encoder_sc->video_avc, NULL) < 0)
    {
        logging("Could not open the Codec.");
        return -1;
    }
    /* Propagate the final encoder settings into the output stream header. */
    avcodec_parameters_from_context(encoder_sc->video_avs->codecpar, encoder_sc->video_avcc);
    return 0;
}


/* Create a new output stream and copy the input codec parameters into it
 * (the stream-copy / remux path, used here for audio).
 * Returns 0 on success, -1 on failure.
 * BUG FIX: the original ignored the results of avformat_new_stream and
 * avcodec_parameters_copy, which would crash later on allocation failure. */
int prepare_copy(AVFormatContext *avfc, AVStream **avs, AVCodecParameters *decoder_par)
{
    *avs = avformat_new_stream(avfc, NULL);
    if (!*avs)
    {
        logging("Could not create the output stream.");
        return -1;
    }
    if (avcodec_parameters_copy((*avs)->codecpar, decoder_par) < 0)
    {
        logging("Could not copy the Codec Parameters.");
        return -1;
    }
    return 0;
}

/* Send one decoded frame (or NULL to flush) to the video encoder and write
 * every packet it produces to the output muxer. Always returns 0; encoder
 * and write errors are not propagated. */
int encode_video(StreamingContext *decoder, StreamingContext *encoder, AVFrame *input_frame)
{
    if (input_frame)
        input_frame->pict_type = AV_PICTURE_TYPE_NONE; /* let the encoder choose the picture type */

    AVPacket *output_packet = av_packet_alloc();


    int response = avcodec_send_frame(encoder->video_avcc, input_frame);

    while (response >= 0)
    {
        response = avcodec_receive_packet(encoder->video_avcc, output_packet);
        if (response == AVERROR(EAGAIN) || response == AVERROR_EOF)
        {
            break;
        }

        /* NOTE(review): this assumes the output video stream index equals the
         * input video stream index — they can differ; confirm. */
        output_packet->stream_index = decoder->video_index;
        /* NOTE(review): with time_base = 1/framerate this sets the duration to
         * den/num ticks — verify this matches one frame, it looks off. */
        output_packet->duration = encoder->video_avs->time_base.den / encoder->video_avs->time_base.num;

        /* NOTE(review): packets leaving the encoder carry timestamps in the
         * encoder codec time base, not decoder->video_avs->time_base; rescaling
         * from the decoder stream time base here may be incorrect — confirm. */
        av_packet_rescale_ts(output_packet, decoder->video_avs->time_base, encoder->video_avs->time_base);
        response = av_interleaved_write_frame(encoder->avfc, output_packet);
    }

    av_packet_unref(output_packet);
    av_packet_free(&output_packet);

    return 0;
}

/* Copy one (audio) packet to the output: rescale its duration and timestamps
 * from the input stream time base to the output stream time base and write it.
 * Returns 0 on success, -1 on write failure.
 * NOTE(review): the timestamps are copied as-is, so when the input loops
 * (-stream_loop) the DTS restarts from the beginning and the muxer rejects the
 * packet with "non monotonically increasing dts" — this is the reported bug. */
int remux(AVPacket **pkt, AVFormatContext **avfc, AVRational decoder_tb, AVRational encoder_tb)
{
    (*pkt)->duration = av_rescale_q((*pkt)->duration, decoder_tb, encoder_tb);
    (*pkt)->pos = -1; /* byte position is meaningless after remuxing */
    av_packet_rescale_ts(*pkt, decoder_tb, encoder_tb);
    if (av_interleaved_write_frame(*avfc, *pkt) < 0)
    {
        logging("Error while copying Stream Packet.");
        return -1;
    }
    return 0;
}

/* Feed one packet to the video decoder and hand every frame it yields to the
 * video encoder. Returns 0 normally, -1 if encoding fails. */
int transcode_video(StreamingContext *decoder, StreamingContext *encoder, AVPacket *input_packet, AVFrame *input_frame)
{
    int ret = avcodec_send_packet(decoder->video_avcc, input_packet);

    for (; ret >= 0; av_frame_unref(input_frame))
    {
        ret = avcodec_receive_frame(decoder->video_avcc, input_frame);

        /* The decoder needs more input, or has been fully drained. */
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
            break;
        if (ret < 0)
            continue;

        if (encode_video(decoder, encoder, input_frame))
            return -1;
    }
    return 0;
}

/* Pull packets from the RTSP input, transcode video to sp.video_codec and
 * stream-copy audio into an mp4 file, until the input ends or errors.
 * Returns 0 on success, -1 on any fatal error. */
int main(int argc, char *argv[])
{
    const int scaled_frame_width = 854;
    const int scaled_frame_height = 480;
    StreamingParams sp = {0};
    sp.copy_audio = 1;
    sp.copy_video = 0;
    sp.video_codec = "libsvtav1";

    StreamingContext *decoder = (StreamingContext *)calloc(1, sizeof(StreamingContext));
    decoder->filename = "rtsp://localhost:8554/mystream";

    StreamingContext *encoder = (StreamingContext *)calloc(1, sizeof(StreamingContext));

    /* BUG FIX: the original pointed encoder->filename at a string literal and
     * then strcat()'ed sp.output_extension onto it — writing into a string
     * literal is undefined behavior. Build the name in a writable buffer. */
    static char out_filename[1024];
    snprintf(out_filename, sizeof out_filename, "%s%s",
             "small_bunny_9.mp4",
             sp.output_extension ? sp.output_extension : "");
    encoder->filename = out_filename;

    open_media(decoder->filename, &decoder->avfc);
    prepare_decoder(decoder);


    avformat_alloc_output_context2(&encoder->avfc, NULL, "mp4", encoder->filename);
    AVRational input_framerate = av_guess_frame_rate(decoder->avfc, decoder->video_avs, NULL);
    prepare_video_encoder(encoder, decoder->video_avcc, input_framerate, sp, scaled_frame_width, scaled_frame_height);

    prepare_copy(encoder->avfc, &encoder->audio_avs, decoder->audio_avs->codecpar);

    /* NOTE(review): AV_CODEC_FLAG_GLOBAL_HEADER is a codec-context flag; OR-ing
     * it into the format-context flags is suspicious — it normally belongs on
     * the encoder context before avcodec_open2(). Left as-is. */
    if (encoder->avfc->oformat->flags & AVFMT_GLOBALHEADER)
        encoder->avfc->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;

    if (!(encoder->avfc->oformat->flags & AVFMT_NOFILE))
    {
        if (avio_open(&encoder->avfc->pb, encoder->filename, AVIO_FLAG_WRITE) < 0)
        {
            logging("could not open the output file");
            return -1;
        }
    }

    if (avformat_write_header(encoder->avfc, NULL) < 0)
    {
        logging("an error occurred when opening output file");
        return -1;
    }

    AVFrame *input_frame = av_frame_alloc();
    AVPacket *input_packet = av_packet_alloc();

    while (1)
    {
        int ret = av_read_frame(decoder->avfc, input_packet);
        if (ret < 0)
            break;
        if (decoder->avfc->streams[input_packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
        {
            if (transcode_video(decoder, encoder, input_packet, input_frame))
                return -1;
            av_packet_unref(input_packet);
        }
        else if (decoder->avfc->streams[input_packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
        {
            if (remux(&input_packet, &encoder->avfc, decoder->audio_avs->time_base, encoder->audio_avs->time_base))
                return -1;
            /* BUG FIX: the original never unreferenced the audio packet,
             * leaking every payload buffer. */
            av_packet_unref(input_packet);
        }
        else
        {
            logging("Ignoring all nonvideo or audio packets.");
        }
    }

    /* Flush the video encoder (NULL frame = drain). */
    if (encode_video(decoder, encoder, NULL))
        return -1;

    av_write_trailer(encoder->avfc);

    av_frame_free(&input_frame);
    av_packet_free(&input_packet);

    /* BUG FIX: close the output AVIOContext so buffered data is flushed
     * before the format context is freed (the original never closed it). */
    if (!(encoder->avfc->oformat->flags & AVFMT_NOFILE))
        avio_closep(&encoder->avfc->pb);

    avformat_close_input(&decoder->avfc); /* frees decoder->avfc and NULLs it */

    avformat_free_context(encoder->avfc);
    encoder->avfc = NULL;

    avcodec_free_context(&decoder->video_avcc);
    avcodec_free_context(&decoder->audio_avcc);
    /* BUG FIX: the encoder codec context was never freed. */
    avcodec_free_context(&encoder->video_avcc);

    free(decoder);
    decoder = NULL;
    free(encoder);
    encoder = NULL;

    return 0;
}


Solution

  • We may correct the DTS timestamps when remuxing the audio packets to make sure that the DTS timestamps are monotonically increasing.

    The "non monotonically increasing dts" issue is mainly a result of streaming the input video file in a loop.

    Each time the loop starts, the timestamps starts from the beginning.
    The remuxing procedure copies the timestamps from the input to the output, so there is a "non-increased DTS scenario" each time the loop restarts.

    We have to fix the DTS timestamps to keep them monotonically increasing.
    We may also fix the PTS timestamps the same way, because the PTS should match the DTS in a valid audio stream.


    Note:
    For keeping the synchronization between the audio and the video, we may also have to fix the timestamps of the video stream.

    After fixing the audio, the video encoding is also not working.
    My answer is going to address only the "non monotonically increasing dts" issue.
    Fixing the video stream seems too challenging.

    Note:
    Using AV1 video encoder may raise other issues, since AV1 encoding may not meet the Realtime constraints (can't keep up with the rate of streaming input).
    For getting a playable output file, I used libx264 encoder instead of libsvtav1.
    In case your CPU is "strong" enough, AV1 may also work.


    For "manually" fixing the DTS (and PTS) timestamps, we may use the following structure for tracking the timestamps and the required offsets:

    typedef struct TimestampsTracking{
        int64_t prv_pts;
        int64_t pts_offset;
        int64_t prv_dts;
        int64_t dts_offset;
        int64_t prv_duration;
    } TimestampsTracking;
    

    Pass a pointer to the structure to remux function:

    int remux(AVPacket **pkt, AVFormatContext **avfc, AVRational decoder_tb, AVRational encoder_tb, TimestampsTracking *tt)
    

    After av_packet_rescale_ts(*pkt, decoder_tb, encoder_tb), keep the original timestamps, and add offsets that fixes the timestamps:

    int64_t orig_pts = (*pkt)->pts;
    int64_t orig_dts = (*pkt)->dts;
    (*pkt)->pts += tt->pts_offset; //Add offset to PTS (the offset ensures that the PTS are correct after each loop of the input video file).
    (*pkt)->dts += tt->dts_offset; //Add offset to DTS (the offset ensures that the DTS are correct and monotonous after each loop of the input video file).
    

    After adding the offset, we may check if the timestamps are not monotonically increased.
    In case they are not monotonically increased, fix the timestamp to be monotonically increased, and update the offset (to be used in the next packet):

    if ((*pkt)->dts < tt->prv_dts)
    {
        //Wrapped around...
        //Set the DTS to be the previous DTS plus duration of a single packet.
        //Note: add the duration of the previous packet instead of adding (*pkt)->duration, because the last audio packet in the file may be shorter than the nominal audio packet duration.
        (*pkt)->dts = tt->prv_dts + tt->prv_duration;
        tt->dts_offset = (*pkt)->dts - orig_dts;
    }
    
    //Use the same solution for the PTS...
    

    We also have to store the timestamps to be used with the next packet:

    tt->prv_pts = (*pkt)->pts;  //Store PTS of previous packet
    tt->prv_dts = (*pkt)->dts;  //Store DTS of previous packet
    tt->prv_duration = (*pkt)->duration; //Store the duration of the previous packet.
    

    Now we can execute av_interleaved_write_frame without getting an error:

    if (av_interleaved_write_frame(*avfc, *pkt) < 0)
    {
        logging("Error while copying Stream Packet.");
        return -1;
    }
    

    Complete updated code:


    Note:


    Update:

    The source of the problem is related to the timestamps of the input audio and video due to the looping.
    We may correct the timestamps of the input using setpts and asetpts filters:

    ffmpeg -re -stream_loop -1 -i small_bunny_1080p_60fps.mp4 -filter_complex "[0:v]setpts=N/FRAME_RATE/TB[v];[0:a]asetpts=N/SR/TB[a]" -map "[v]" -map "[a]" -ac 2 -f rtsp -rtsp_transport tcp rtsp://localhost:8554/mystream

    We may still need a DTS timestamps protection at the beginning (but we may simplify the logic).


    For getting valid output file, we have to close the output file gracefully.

    Instead of while(1), we may loop until Esc key is pressed.

    In Windows, we may use _kbhit() and _getch() functions for getting the last key pressed without blocking.

    Updated code sample:

    #include <libavcodec/avcodec.h>
    #include <libavformat/avformat.h>
    #include <libavutil/timestamp.h>
    #include <libavutil/opt.h>
    #include <libswscale/swscale.h>
    #include <stdio.h>
    #include <stdarg.h>
    #include <stdlib.h>
    
    #include <string.h>
    #include <inttypes.h>
    
    //Include conio.h for using _kbhit (Windows only).
    #include <conio.h>
    
    /* Bundles everything needed to read from or write to one media container:
     * format context, codecs, streams, codec contexts and stream indices. */
    typedef struct StreamingContext{
        AVFormatContext* avfc;
        const AVCodec *video_avc;    /* video codec (decoder or encoder) */
        const AVCodec *audio_avc;    /* audio codec */
        AVStream *video_avs;         /* video stream inside avfc */
        AVStream *audio_avs;         /* audio stream inside avfc */
        AVCodecContext *video_avcc;  /* opened video codec context */
        AVCodecContext *audio_avcc;  /* opened audio codec context */
        int video_index;             /* index of the video stream in avfc */
        int audio_index;             /* index of the audio stream in avfc */
        char* filename;              /* input URL or output file name (not owned) */
        struct SwsContext *sws_ctx;  /* scaler context; unused in this sample */
    }StreamingContext;
    
    
    /* User-selectable transcoding options: copy-vs-transcode flags, codec
     * names, and optional muxer / codec-private key-value options. */
    typedef struct StreamingParams{
        char copy_video;             /* nonzero: remux video instead of transcoding */
        char copy_audio;             /* nonzero: remux audio instead of transcoding */
        char *output_extension;      /* optional extension appended to the output name */
        char *muxer_opt_key;
        char *muxer_opt_value;
        char *video_codec;           /* encoder name, e.g. "libx264" */
        char *audio_codec;
        char *codec_priv_key;        /* optional private option for the video encoder */
        char *codec_priv_value;
    }StreamingParams;
    
    
    
    //Track the PTS and DTS timestamps for adjusting the offset
    //Adding offset is used for avoiding non-monotonous timestamps. 
    ////////////////////////////////////////////////////////////////////////////////
    typedef struct TimestampsTracking{
        int64_t prv_pts;       /* PTS of the previously written packet */
        int64_t pts_offset;    /* offset added to every incoming PTS */
        int64_t prv_dts;       /* DTS of the previously written packet */
        int64_t dts_offset;    /* offset added to every incoming DTS */
        int64_t prv_duration;  /* duration of the previously written packet */
    } TimestampsTracking;
    ////////////////////////////////////////////////////////////////////////////////
    
    
    /* Print a printf-style message to stderr, prefixed with "LOG: " and
     * terminated with a newline. */
    void logging(const char *fmt, ...)
    {
        va_list ap;

        va_start(ap, fmt);
        fprintf(stderr, "LOG: ");
        vfprintf(stderr, fmt, ap);
        fprintf(stderr, "\n");
        va_end(ap);
    }
    
    /* Find, allocate and open a decoder for the given stream.
     * On success *avc and *avcc are filled in and 0 is returned; on any failure
     * -1 is returned (previously the NULL results of avcodec_find_decoder /
     * avcodec_alloc_context3 and the return of avcodec_open2 were ignored). */
    int fill_stream_info(AVStream *avs, const AVCodec **avc, AVCodecContext **avcc)
    {
        *avc = avcodec_find_decoder(avs->codecpar->codec_id);
        if (!*avc)
        {
            logging("Failed to find the Codec.");
            return -1;
        }

        *avcc = avcodec_alloc_context3(*avc);
        if (!*avcc)
        {
            logging("Failed to allocate the Codec Context.");
            return -1;
        }

        if (avcodec_parameters_to_context(*avcc, avs->codecpar) < 0)
        {
            logging("Failed to fill Codec Context.");
            return -1;
        }

        if (avcodec_open2(*avcc, *avc, NULL) < 0)
        {
            logging("Failed to open the Codec.");
            return -1;
        }
        return 0;
    }
    
    /* Open the input (file or RTSP URL) and probe its streams.
     * Returns 0 on success, -1 on failure. */
    int open_media(const char *in_filename, AVFormatContext **avfc)
    {
        *avfc = avformat_alloc_context();

        int err = avformat_open_input(avfc, in_filename, NULL, NULL);
        if (err != 0)
        {
            logging("Failed to open input file %s", in_filename);
            return -1;
        }

        err = avformat_find_stream_info(*avfc, NULL);
        if (err < 0)
        {
            logging("Failed to get Stream Info.");
            return -1;
        }

        return 0;
    }
    
    /* Walk every stream of the opened input and set up a decoder for the video
     * and the audio stream. Returns 0 on success, -1 on failure. */
    int prepare_decoder(StreamingContext *sc)
    {
        for (int i = 0; i < (int)sc->avfc->nb_streams; i++)
        {
            AVStream *st = sc->avfc->streams[i];

            switch (st->codecpar->codec_type)
            {
            case AVMEDIA_TYPE_VIDEO:
                sc->video_avs = st;
                sc->video_index = i;
                if (fill_stream_info(st, &sc->video_avc, &sc->video_avcc))
                    return -1;
                break;

            case AVMEDIA_TYPE_AUDIO:
                sc->audio_avs = st;
                sc->audio_index = i;
                if (fill_stream_info(st, &sc->audio_avc, &sc->audio_avcc))
                    return -1;
                break;

            default:
                logging("Skipping Streams other than Audio and Video.");
                break;
            }
        }
        return 0;
    }
    
    /* Create the output video stream and configure + open the video encoder
     * selected by sp.video_codec. The encoder (and stream) time base is set to
     * 1/framerate. Returns 0 on success, -1 on failure. */
    int prepare_video_encoder(StreamingContext *encoder_sc, AVCodecContext *decoder_ctx, AVRational input_framerate,
                              StreamingParams sp, int scaled_frame_width, int scaled_frame_height)
    {
        encoder_sc->video_avs = avformat_new_stream(encoder_sc->avfc, NULL);
        encoder_sc->video_avc = avcodec_find_encoder_by_name(sp.video_codec);
        if (!encoder_sc->video_avc)
        {
            logging("Cannot find the Codec.");
            return -1;
        }
    
        encoder_sc->video_avcc = avcodec_alloc_context3(encoder_sc->video_avc);
        if (!encoder_sc->video_avcc)
        {
            logging("Could not allocate memory for Codec Context.");
            return -1;
        }
    
        av_opt_set(encoder_sc->video_avcc->priv_data, "preset", "fast", 0);  //Unable to parse option value "fast" (libsvtav1 expects a numeric preset)
        //av_opt_set(encoder_sc->video_avcc->priv_data, "preset", "-1", 0);  //Encoding preset (from -1 to 13) (default -1)
        if (sp.codec_priv_key && sp.codec_priv_value)
            av_opt_set(encoder_sc->video_avcc->priv_data, sp.codec_priv_key, sp.codec_priv_value, 0);
    
        encoder_sc->video_avcc->height = scaled_frame_height;
        encoder_sc->video_avcc->width = scaled_frame_width;
        encoder_sc->video_avcc->sample_aspect_ratio = decoder_ctx->sample_aspect_ratio;
    
        //Prefer the encoder's first supported pixel format; otherwise keep the decoder's.
        if (encoder_sc->video_avc->pix_fmts)
            encoder_sc->video_avcc->pix_fmt = encoder_sc->video_avc->pix_fmts[0];
        else
            encoder_sc->video_avcc->pix_fmt = decoder_ctx->pix_fmt;
    
        encoder_sc->video_avcc->bit_rate = 2 * 1000 * 1000;  //~2 Mb/s target
    
        encoder_sc->video_avcc->time_base = av_inv_q(input_framerate);
        encoder_sc->video_avs->time_base = encoder_sc->video_avcc->time_base;
    
        
    
        if (avcodec_open2(encoder_sc->video_avcc, encoder_sc->video_avc, NULL) < 0)
        {
            logging("Could not open the Codec.");
            return -1;
        }
        //Propagate the final encoder settings into the output stream header.
        avcodec_parameters_from_context(encoder_sc->video_avs->codecpar, encoder_sc->video_avcc);
        return 0;
    }
    
    
    /* Create a new output stream and copy the input codec parameters into it
     * (the stream-copy / remux path, used here for audio).
     * Returns 0 on success, -1 on failure.
     * BUG FIX: the original ignored the results of avformat_new_stream and
     * avcodec_parameters_copy, which would crash later on allocation failure. */
    int prepare_copy(AVFormatContext *avfc, AVStream **avs, AVCodecParameters *decoder_par)
    {
        *avs = avformat_new_stream(avfc, NULL);
        if (!*avs)
        {
            logging("Could not create the output stream.");
            return -1;
        }
        if (avcodec_parameters_copy((*avs)->codecpar, decoder_par) < 0)
        {
            logging("Could not copy the Codec Parameters.");
            return -1;
        }
        return 0;
    }
    
    /* Send one decoded frame (or NULL to flush) to the video encoder and write
     * every packet it produces to the output muxer. Always returns 0; encoder
     * and write errors are not propagated. */
    int encode_video(StreamingContext *decoder, StreamingContext *encoder, AVFrame *input_frame)
    {
        if (input_frame)
            input_frame->pict_type = AV_PICTURE_TYPE_NONE;  //let the encoder choose the picture type
    
        AVPacket *output_packet = av_packet_alloc();
    
    
        int response = avcodec_send_frame(encoder->video_avcc, input_frame);
    
        while (response >= 0)
        {
            response = avcodec_receive_packet(encoder->video_avcc, output_packet);
            if (response == AVERROR(EAGAIN) || response == AVERROR_EOF)
            {
                break;
            }
    
            //NOTE(review): assumes the output video stream index equals the input
            //video stream index — they can differ; confirm.
            output_packet->stream_index = decoder->video_index;
            //NOTE(review): with time_base = 1/framerate this sets duration to
            //den/num ticks — verify this matches one frame, it looks off.
            output_packet->duration = encoder->video_avs->time_base.den / encoder->video_avs->time_base.num;
    
            //NOTE(review): packets leaving the encoder carry timestamps in the
            //encoder codec time base, not decoder->video_avs->time_base; rescaling
            //from the decoder stream time base here may be incorrect — confirm.
            av_packet_rescale_ts(output_packet, decoder->video_avs->time_base, encoder->video_avs->time_base);
            response = av_interleaved_write_frame(encoder->avfc, output_packet);
        }
    
        av_packet_unref(output_packet);
        av_packet_free(&output_packet);
    
        return 0;
    }
    
    //Add TimestampsTracking argument to remux function.
    int remux(AVPacket **pkt, AVFormatContext **avfc, AVRational decoder_tb, AVRational encoder_tb, TimestampsTracking *tt)
    {
        (*pkt)->duration = av_rescale_q((*pkt)->duration, decoder_tb, encoder_tb);
        (*pkt)->pos = -1;
    
        av_packet_rescale_ts(*pkt, decoder_tb, encoder_tb);
    
        int64_t orig_pts = (*pkt)->pts;
        int64_t orig_dts = (*pkt)->dts;
        (*pkt)->pts += tt->pts_offset; //Add offset to PTS (the offset ensures that the PTS are correct after each loop of the input video file).
        (*pkt)->dts += tt->dts_offset; //Add offset to DTS (the offset ensures that the DTS are correct and monotonous after each loop of the input video file).
    
        //The input video file wrapped around - we have to fix dts_offset for keeping the DTS monotonous increment.
        ////////////////////////////////////////////////////////////////////////////
        if ((*pkt)->dts <= tt->prv_dts)
        {
            //Wrapped around...
            //Set the DTS to be the previous DTS plus duration of a single packet.
            //Note: add the duration of the previous packet instead of adding (*pkt)->duration, because the last audio packet in the file may be shorter than the nominal audio packet duration.
            (*pkt)->dts = tt->prv_dts + tt->prv_duration;
            tt->dts_offset = (*pkt)->dts - orig_dts;
        }
        ////////////////////////////////////////////////////////////////////////////
    
        //Use the same solution for the PTS
        //In case of audio, the PTS are monotonously increased.
        //In case of video the PTS may not be monotonously increased (due to B-Frames).
        //The solution may not work for video packets.
        ////////////////////////////////////////////////////////////////////////////
        if ((*pkt)->pts <= tt->prv_pts)
        {
            //Wrapped around...
            (*pkt)->pts = tt->prv_pts + tt->prv_duration;  //Set the PTS to be the previous PTS plus duration of a single packet.
            tt->pts_offset = (*pkt)->pts - orig_pts;
        }
        ////////////////////////////////////////////////////////////////////////////
    
        //printf("dts = %lld\n", (*pkt)->dts);
        //printf("DTS delta = %lld\n", (*pkt)->dts - tt->prv_dts);
    
    
        tt->prv_pts = (*pkt)->pts;  //Store PTS of previous packet
        tt->prv_dts = (*pkt)->dts;  //Store DTS of previous packet
        tt->prv_duration = (*pkt)->duration; //Store the duration of the previous packet.
    
    
        if (av_interleaved_write_frame(*avfc, *pkt) < 0)
        {
            logging("Error while copying Stream Packet.");
            return -1;
        }
        return 0;
    }
    
    
    /* Feed one packet to the video decoder and hand every frame it yields to the
     * video encoder. Returns 0 normally, -1 if encoding fails. */
    int transcode_video(StreamingContext *decoder, StreamingContext *encoder, AVPacket *input_packet, AVFrame *input_frame)
    {
        int ret = avcodec_send_packet(decoder->video_avcc, input_packet);

        for (; ret >= 0; av_frame_unref(input_frame))
        {
            ret = avcodec_receive_frame(decoder->video_avcc, input_frame);

            //The decoder needs more input, or has been fully drained.
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                break;
            if (ret < 0)
                continue;

            if (encode_video(decoder, encoder, input_frame))
                return -1;
        }
        return 0;
    }
    
    int main(int argc, char *argv[]){
        const int scaled_frame_width = 854;
        const int scaled_frame_height = 480;
        StreamingParams sp = {0};
    
        //TimestampsTracking is used for avoiding non-monotonous timestamps. 
        ////////////////////////////////////////////////////////////////////////////////
        TimestampsTracking audio_tt = {0};
        audio_tt.prv_pts = 0;
        audio_tt.pts_offset = 0;
        audio_tt.prv_dts = 0;
        audio_tt.dts_offset = 0;
        audio_tt.prv_duration = 0;
        ////////////////////////////////////////////////////////////////////////////////
    
        sp.copy_audio = 1;
        sp.copy_video = 0;
        //sp.video_codec = "libsvtav1";
        sp.video_codec = "libx264";  //Use libx264 codec for testing
        
        StreamingContext *decoder = (StreamingContext *)calloc(1, sizeof(StreamingContext));
        decoder->filename = "rtsp://localhost:8554/mystream";
    
        StreamingContext *encoder = (StreamingContext *)calloc(1, sizeof(StreamingContext));
        encoder->filename = "small_bunny_9.mp4";
        
        if (sp.output_extension)
        {
            strcat(encoder->filename, sp.output_extension);
        }
    
        open_media(decoder->filename, &decoder->avfc);
        prepare_decoder(decoder);
    
    
        avformat_alloc_output_context2(&encoder->avfc, NULL, "mp4", encoder->filename);
        AVRational input_framerate = av_guess_frame_rate(decoder->avfc, decoder->video_avs, NULL);
        prepare_video_encoder(encoder, decoder->video_avcc, input_framerate, sp, scaled_frame_width, scaled_frame_height);
    
        prepare_copy(encoder->avfc, &encoder->audio_avs, decoder->audio_avs->codecpar);
            
    
        if (encoder->avfc->oformat->flags & AVFMT_GLOBALHEADER)
            encoder->avfc->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    
        if (!(encoder->avfc->oformat->flags & AVFMT_NOFILE))
        {
            if (avio_open(&encoder->avfc->pb, encoder->filename, AVIO_FLAG_WRITE) < 0)
            {
                logging("could not open the output file");
                return -1;
            }
        }
    
        
        if (avformat_write_header(encoder->avfc, NULL) < 0)
        {
            logging("an error occurred when opening output file");
            return -1;
        }
    
        AVFrame *input_frame = av_frame_alloc();
        AVPacket *input_packet = av_packet_alloc();
       
        int key = 0;
    
        //Ends the loop when Esc key is pressed.
        while (key != 27)
        {
            if (_kbhit())
            {
                key = _getch();
            }
    
            int ret = av_read_frame(decoder->avfc, input_packet);
            if(ret<0)
                break;
            if (decoder->avfc->streams[input_packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
            {
                if (transcode_video(decoder, encoder, input_packet, input_frame))
                    return -1;
                av_packet_unref(input_packet);
    
            }
            else if (decoder->avfc->streams[input_packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
            {
                if (remux(&input_packet, &encoder->avfc, decoder->audio_avs->time_base, encoder->audio_avs->time_base, &audio_tt))
                    return -1;
    
                ////////////////////////////////////////////////////////////////////
                av_packet_unref(input_packet);
                ////////////////////////////////////////////////////////////////////
            }
            else
            {
                logging("Ignoring all nonvideo or audio packets.");
            }
        }
    
        if (encode_video(decoder, encoder, NULL))
            return -1;
        
    
        av_write_trailer(encoder->avfc);
    
    
        if (input_frame != NULL)
        {
            av_frame_free(&input_frame);
            input_frame = NULL;
        }
    
        if (input_packet != NULL)
        {
            av_packet_free(&input_packet);
            input_packet = NULL;
        }
    
        avformat_close_input(&decoder->avfc);
    
        avformat_free_context(decoder->avfc);
        decoder->avfc = NULL;
        avformat_free_context(encoder->avfc);
        encoder->avfc = NULL;
    
        avcodec_free_context(&decoder->video_avcc);
        decoder->video_avcc = NULL;
        avcodec_free_context(&decoder->audio_avcc);
        decoder->audio_avcc = NULL;
    
        free(decoder);
        decoder = NULL;
        free(encoder);
        encoder = NULL;
    
        return 0;
    }
    

    Now both the audio and the video looks and sounds OK.