ffmpeg: libavformat/libswresample to transcode and resample at same time

I want to transcode and down/re-sample the audio for output using ffmpeg's libav*/libswresample - I am using ffmpeg's (4.x) transcode_aac.c and resample_audio.c as reference - but the code produces audio with glitches that is clearly not what ffmpeg itself would produce (ie ffmpeg -i foo.wav -ar 22050 foo.m4a)

Based on the ffmpeg examples, to resample audio it appears that I need to set the output AVAudioContext and SwrContext sample_rate to what I desire and ensure the swr_convert() is provided with the correct number of output samples based av_rescale_rnd( swr_delay(), ...) once I have an decoded input audio. I've taken care to ensure all the relevant calculations of samples for output are taken into account in the merged code (below):

open_output_file() - AVCodecContext.sample_rate (avctx variable) set to our target (down sampled) sample_rate
read_decode_convert_and_store() is where the work happens: input audio is decoded to an AVFrame and this input frame is converted before being encoded.
- init_converted_samples() and av_samples_alloc() uses the input frame's nb_samples
- ADDED: calc the number of output samples via av_rescale_rnd() and swr_delay()
- UPDATED: convert_samples() and swr_convert() uses the input frame's samples and our calculated output samples as parameters

However the resulting audio file is produced with audio glitches. Does the community know of any references for how transcode AND resample should be done or what is missing in this example?

    /* compile and run:
         gcc -I/usr/include/ffmpeg  transcode-swr-aac.c  -lavformat -lavutil -lavcodec -lswresample -lm
         ./a.out foo.wav foo.m4a
    */

/*
 * Copyright (c) 2013-2018 Andreas Unterweger
 *  
 * This file is part of FFmpeg.                                                 
 ...                                                                       ...
 *   
 * @example transcode_aac.c                                                    
 * Convert an input audio file to AAC in an MP4 container using FFmpeg.         
 * Formats other than MP4 are supported based on the output file extension.                            
 * @author Andreas Unterweger (xxxx@xxxxx.com)
 */  
    #include <stdio.h>
 

    #include "libavformat/avformat.h"
    #include "libavformat/avio.h"
    
    #include "libavcodec/avcodec.h"
    
    #include "libavutil/audio_fifo.h"
    #include "libavutil/avassert.h"
    #include "libavutil/avstring.h"
    #include "libavutil/channel_layout.h"
    #include "libavutil/frame.h"
    #include "libavutil/opt.h"
    
    #include "libswresample/swresample.h"
    
    #define OUTPUT_BIT_RATE 128000
    #define OUTPUT_CHANNELS 2
    
    static int open_input_file(const char *filename,
                               AVFormatContext **input_format_context,
                               AVCodecContext **input_codec_context)
    {
        AVCodecContext *avctx;
        const AVCodec *input_codec;
        const AVStream *stream;
        int error;
    
        if ((error = avformat_open_input(input_format_context, filename, NULL,
                                         NULL)) < 0) {
            fprintf(stderr, "Could not open input file '%s' (error '%s')\n",
                    filename, av_err2str(error));
            *input_format_context = NULL;
            return error;
        }
    

        if ((error = avformat_find_stream_info(*input_format_context, NULL)) < 0) {
            fprintf(stderr, "Could not open find stream info (error '%s')\n",
                    av_err2str(error));
            avformat_close_input(input_format_context);
            return error;
        }
    
        if ((*input_format_context)->nb_streams != 1) {
            fprintf(stderr, "Expected one audio input stream, but found %d\n",
                    (*input_format_context)->nb_streams);
            avformat_close_input(input_format_context);
            return AVERROR_EXIT;
        }
    
        stream = (*input_format_context)->streams[0];
    
        if (!(input_codec = avcodec_find_decoder(stream->codecpar->codec_id))) {
            fprintf(stderr, "Could not find input codec\n");
            avformat_close_input(input_format_context);
            return AVERROR_EXIT;
        }
    
        avctx = avcodec_alloc_context3(input_codec);
        if (!avctx) {
            fprintf(stderr, "Could not allocate a decoding context\n");
            avformat_close_input(input_format_context);
            return AVERROR(ENOMEM);
        }
    
        /* Initialize the stream parameters with demuxer information. */
        error = avcodec_parameters_to_context(avctx, stream->codecpar);
        if (error < 0) {
            avformat_close_input(input_format_context);
            avcodec_free_context(&avctx);
            return error;
        }
    
        /* Open the decoder for the audio stream to use it later. */
        if ((error = avcodec_open2(avctx, input_codec, NULL)) < 0) {
            fprintf(stderr, "Could not open input codec (error '%s')\n",
                    av_err2str(error));
            avcodec_free_context(&avctx);
            avformat_close_input(input_format_context);
            return error;
        }
    
        /* Set the packet timebase for the decoder. */
        avctx->pkt_timebase = stream->time_base;
    
        /* Save the decoder context for easier access later. */
        *input_codec_context = avctx;
    
        return 0;
    }
    
    static int open_output_file(const char *filename,
                                AVCodecContext *input_codec_context,
                                AVFormatContext **output_format_context,
                                AVCodecContext **output_codec_context)
    {
        AVCodecContext *avctx          = NULL;
        AVIOContext *output_io_context = NULL;
        AVStream *stream               = NULL;
        const AVCodec *output_codec    = NULL;
        int error;
    

        if ((error = avio_open(&output_io_context, filename,
                               AVIO_FLAG_WRITE)) < 0) {
            fprintf(stderr, "Could not open output file '%s' (error '%s')\n",
                    filename, av_err2str(error));
            return error;
        }
    

        if (!(*output_format_context = avformat_alloc_context())) {
            fprintf(stderr, "Could not allocate output format context\n");
            return AVERROR(ENOMEM);
        }
    

        (*output_format_context)->pb = output_io_context;
    

        if (!((*output_format_context)->oformat = av_guess_format(NULL, filename,
                                                                  NULL))) {
            fprintf(stderr, "Could not find output file format\n");
            goto cleanup;
        }
    
        if (!((*output_format_context)->url = av_strdup(filename))) {
            fprintf(stderr, "Could not allocate url.\n");
            error = AVERROR(ENOMEM);
            goto cleanup;
        }
    

        if (!(output_codec = avcodec_find_encoder(AV_CODEC_ID_AAC))) {
            fprintf(stderr, "Could not find an AAC encoder.\n");
            goto cleanup;
        }
    
        /* Create a new audio stream in the output file container. */
        if (!(stream = avformat_new_stream(*output_format_context, NULL))) {
            fprintf(stderr, "Could not create new stream\n");
            error = AVERROR(ENOMEM);
            goto cleanup;
        }
    
        avctx = avcodec_alloc_context3(output_codec);
        if (!avctx) {
            fprintf(stderr, "Could not allocate an encoding context\n");
            error = AVERROR(ENOMEM);
            goto cleanup;
        }
    
   /* Set the basic encoder parameters.
    * SET OUR DESIRED output sample_rate here
    */
        avctx->channels       = OUTPUT_CHANNELS;
        avctx->channel_layout = av_get_default_channel_layout(OUTPUT_CHANNELS);
        // avctx->sample_rate    = input_codec_context->sample_rate;
        avctx->sample_rate    = 22050;
        avctx->sample_fmt     = output_codec->sample_fmts[0];
        avctx->bit_rate       = OUTPUT_BIT_RATE;
    
        avctx->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    
        /* Set the sample rate for the container. */
        stream->time_base.den = avctx->sample_rate;
        stream->time_base.num = 1;
    
        if ((*output_format_context)->oformat->flags & AVFMT_GLOBALHEADER)
            avctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    
        if ((error = avcodec_open2(avctx, output_codec, NULL)) < 0) {
            fprintf(stderr, "Could not open output codec (error '%s')\n",
                    av_err2str(error));
            goto cleanup;
        }
    
        error = avcodec_parameters_from_context(stream->codecpar, avctx);
        if (error < 0) {
            fprintf(stderr, "Could not initialize stream parameters\n");
            goto cleanup;
        }
    
        /* Save the encoder context for easier access later. */
        *output_codec_context = avctx;
    
        return 0;
    
    cleanup:
        avcodec_free_context(&avctx);
        avio_closep(&(*output_format_context)->pb);
        avformat_free_context(*output_format_context);
        *output_format_context = NULL;
        return error < 0 ? error : AVERROR_EXIT;
    }
    
    /**
     * Initialize one data packet for reading or writing.
     */
    static int init_packet(AVPacket **packet)
    {
        if (!(*packet = av_packet_alloc())) {
            fprintf(stderr, "Could not allocate packet\n");
            return AVERROR(ENOMEM);
        }
        return 0;
    }
    
    static int init_input_frame(AVFrame **frame)
    {
        if (!(*frame = av_frame_alloc())) {
            fprintf(stderr, "Could not allocate input frame\n");
            return AVERROR(ENOMEM);
        }
        return 0;
    }
    
    static int init_resampler(AVCodecContext *input_codec_context,
                              AVCodecContext *output_codec_context,
                              SwrContext **resample_context)
    {
            int error;

  /**
   * create the resample, including ref to the desired output sample rate
   */
            *resample_context = swr_alloc_set_opts(NULL,
                                                  av_get_default_channel_layout(output_codec_context->channels),
                                                  output_codec_context->sample_fmt,
                                                  output_codec_context->sample_rate,
                              av_get_default_channel_layout(input_codec_context->channels),
                                                  input_codec_context->sample_fmt,
                                                  input_codec_context->sample_rate,
                                                  0, NULL);
            if (!*resample_context < 0) {
                fprintf(stderr, "Could not allocate resample context\n");
            return AVERROR(ENOMEM);
            }
    
            if ((error = swr_init(*resample_context)) < 0) {
                fprintf(stderr, "Could not open resample context\n");
                swr_free(resample_context);
                return error;
            }
        return 0;
    }
    
    static int init_fifo(AVAudioFifo **fifo, AVCodecContext *output_codec_context)
    {
        if (!(*fifo = av_audio_fifo_alloc(output_codec_context->sample_fmt,
                                          output_codec_context->channels, 1))) {
            fprintf(stderr, "Could not allocate FIFO\n");
            return AVERROR(ENOMEM);
        }
        return 0;
    }
    
    static int write_output_file_header(AVFormatContext *output_format_context)
    {
        int error;
        if ((error = avformat_write_header(output_format_context, NULL)) < 0) {
            fprintf(stderr, "Could not write output file header (error '%s')\n",
                    av_err2str(error));
            return error;
        }
        return 0;
    }
    
    static int decode_audio_frame(AVFrame *frame,
                                  AVFormatContext *input_format_context,
                                  AVCodecContext *input_codec_context,
                                  int *data_present, int *finished)
    {
        AVPacket *input_packet;
        int error;
    
        error = init_packet(&input_packet);
        if (error < 0)
            return error;
    
        *data_present = 0;
        *finished = 0;

        if ((error = av_read_frame(input_format_context, input_packet)) < 0) {
            if (error == AVERROR_EOF)
                *finished = 1;
            else {
                fprintf(stderr, "Could not read frame (error '%s')\n",
                        av_err2str(error));
                goto cleanup;
            }
        }
    
        if ((error = avcodec_send_packet(input_codec_context, input_packet)) < 0) {
            fprintf(stderr, "Could not send packet for decoding (error '%s')\n",
                    av_err2str(error));
            goto cleanup;
        }
    
        error = avcodec_receive_frame(input_codec_context, frame);
        if (error == AVERROR(EAGAIN)) {
            error = 0;
            goto cleanup;
        } else if (error == AVERROR_EOF) {
            *finished = 1;
            error = 0;
            goto cleanup;
        } else if (error < 0) {
            fprintf(stderr, "Could not decode frame (error '%s')\n",
                    av_err2str(error));
            goto cleanup;
        } else {
            *data_present = 1;
            goto cleanup;
        }
    
    cleanup:
        av_packet_free(&input_packet);
        return error;
    }
    
    static int init_converted_samples(uint8_t ***converted_input_samples,
                                      AVCodecContext *output_codec_context,
                                      int frame_size)
    {
        int error;
    
        if (!(*converted_input_samples = calloc(output_codec_context->channels,
                                                sizeof(**converted_input_samples)))) {
            fprintf(stderr, "Could not allocate converted input sample pointers\n");
            return AVERROR(ENOMEM);
        }
    

        if ((error = av_samples_alloc(*converted_input_samples, NULL,
                                      output_codec_context->channels,
                                      frame_size,
                                      output_codec_context->sample_fmt, 0)) < 0) {
            fprintf(stderr,
                    "Could not allocate converted input samples (error '%s')\n",
                    av_err2str(error));
            av_freep(&(*converted_input_samples)[0]);
            free(*converted_input_samples);
            return error;
        }
        return 0;
    }
    
    static int convert_samples(const uint8_t **input_data, const int input_nb_samples,
                               uint8_t **converted_data, const int output_nb_samples,
                               SwrContext *resample_context)
    {
        int error;
    
        if ((error = swr_convert(resample_context,
                                 converted_data, output_nb_samples,
                                 input_data    , input_nb_samples)) < 0) {
            fprintf(stderr, "Could not convert input samples (error '%s')\n",
                    av_err2str(error));
            return error;
        }
    
        return 0;
    }
    
    static int add_samples_to_fifo(AVAudioFifo *fifo,
                                   uint8_t **converted_input_samples,
                                   const int frame_size)
    {
        int error;
    
        if ((error = av_audio_fifo_realloc(fifo, av_audio_fifo_size(fifo) + frame_size)) < 0) {
            fprintf(stderr, "Could not reallocate FIFO\n");
            return error;
        }
    
        if (av_audio_fifo_write(fifo, (void **)converted_input_samples,
                                frame_size) < frame_size) {
            fprintf(stderr, "Could not write data to FIFO\n");
            return AVERROR_EXIT;
        }
        return 0;
    }
    
    static int read_decode_convert_and_store(AVAudioFifo *fifo,
                                             AVFormatContext *input_format_context,
                                             AVCodecContext *input_codec_context,
                                             AVCodecContext *output_codec_context,
                                             SwrContext *resampler_context,
                                             int *finished)
    {
        AVFrame *input_frame = NULL;
        uint8_t **converted_input_samples = NULL;
        int data_present;
        int ret = AVERROR_EXIT;
    

        if (init_input_frame(&input_frame))
            goto cleanup;

        if (decode_audio_frame(input_frame, input_format_context,
                               input_codec_context, &data_present, finished))
            goto cleanup;

        if (*finished) {
            ret = 0;
            goto cleanup;
        }

        if (data_present) {
            /* Initialize the temporary storage for the converted input samples. */
            if (init_converted_samples(&converted_input_samples, output_codec_context,
                                       input_frame->nb_samples))
                goto cleanup;
 
    /* figure out how many samples are required for target sample_rate incl
     * any items left in the swr buffer
     */   
            int  output_nb_samples = av_rescale_rnd(
                                       swr_get_delay(resampler_context, input_codec_context->sample_rate) + input_frame->nb_samples,
                                       output_codec_context->sample_rate, 
                                        input_codec_context->sample_rate,
                                       AV_ROUND_UP);
 
            /* ignore, just to ensure we've got enough buffer alloc'd for conversion buffer */
            av_assert1(input_frame->nb_samples > output_nb_samples);
   
    /* Convert the input samples to the desired output sample format, via swr_convert().
     */
            if (convert_samples((const uint8_t**)input_frame->extended_data, input_frame->nb_samples,
                        converted_input_samples, output_nb_samples,
                    resampler_context))
                goto cleanup;
    
            /* Add the converted input samples to the FIFO buffer for later processing. */
            if (add_samples_to_fifo(fifo, converted_input_samples,
                                    output_nb_samples))
                goto cleanup;
            ret = 0;
        }
        ret = 0;
    
    cleanup:
        if (converted_input_samples) {
            av_freep(&converted_input_samples[0]);
            free(converted_input_samples);
        }
        av_frame_free(&input_frame);
    
        return ret;
    }
    
    static int init_output_frame(AVFrame **frame,
                                 AVCodecContext *output_codec_context,
                                 int frame_size)
    {
        int error;
    
        if (!(*frame = av_frame_alloc())) {
            fprintf(stderr, "Could not allocate output frame\n");
            return AVERROR_EXIT;
        }
    
        /* Set the frame's parameters, especially its size and format.
         * av_frame_get_buffer needs this to allocate memory for the
         * audio samples of the frame.
         * Default channel layouts based on the number of channels
         * are assumed for simplicity. */
        (*frame)->nb_samples     = frame_size;
        (*frame)->channel_layout = output_codec_context->channel_layout;
        (*frame)->format         = output_codec_context->sample_fmt;
        (*frame)->sample_rate    = output_codec_context->sample_rate;
    
        /* Allocate the samples of the created frame. This call will make
         * sure that the audio frame can hold as many samples as specified. */
        if ((error = av_frame_get_buffer(*frame, 0)) < 0) {
            fprintf(stderr, "Could not allocate output frame samples (error '%s')\n",
                    av_err2str(error));
            av_frame_free(frame);
            return error;
        }
    
        return 0;
    }
    
    /* Global timestamp for the audio frames. */
    static int64_t pts = 0;
    
    /**
     * Encode one frame worth of audio to the output file.
     */
    static int encode_audio_frame(AVFrame *frame,
                                  AVFormatContext *output_format_context,
                                  AVCodecContext *output_codec_context,
                                  int *data_present)
    {
        AVPacket *output_packet;
        int error;
    
        error = init_packet(&output_packet);
        if (error < 0)
            return error;
    
        /* Set a timestamp based on the sample rate for the container. */
        if (frame) {
            frame->pts = pts;
            pts += frame->nb_samples;
        }
    
        *data_present = 0;
        error = avcodec_send_frame(output_codec_context, frame);
        if (error < 0 && error != AVERROR_EOF) {
          fprintf(stderr, "Could not send packet for encoding (error '%s')\n",
                  av_err2str(error));
          goto cleanup;
        }
    

        error = avcodec_receive_packet(output_codec_context, output_packet);
        if (error == AVERROR(EAGAIN)) {
            error = 0;
            goto cleanup;
        } else if (error == AVERROR_EOF) {
            error = 0;
            goto cleanup;
        } else if (error < 0) {
            fprintf(stderr, "Could not encode frame (error '%s')\n",
                    av_err2str(error));
            goto cleanup;
        } else {
            *data_present = 1;
        }
    
        /* Write one audio frame from the temporary packet to the output file. */
        if (*data_present &&
            (error = av_write_frame(output_format_context, output_packet)) < 0) {
            fprintf(stderr, "Could not write frame (error '%s')\n",
                    av_err2str(error));
            goto cleanup;
        }
    
    cleanup:
        av_packet_free(&output_packet);
        return error;
    }
    
    /**
     * Load one audio frame from the FIFO buffer, encode and write it to the
     * output file.
     */
    static int load_encode_and_write(AVAudioFifo *fifo,
                                     AVFormatContext *output_format_context,
                                     AVCodecContext *output_codec_context)
    {
        AVFrame *output_frame;
        /* Use the maximum number of possible samples per frame.
         * If there is less than the maximum possible frame size in the FIFO
         * buffer use this number. Otherwise, use the maximum possible frame size. */
        const int frame_size = FFMIN(av_audio_fifo_size(fifo),
                                     output_codec_context->frame_size);
        int data_written;
    
        if (init_output_frame(&output_frame, output_codec_context, frame_size))
            return AVERROR_EXIT;
    
        /* Read as many samples from the FIFO buffer as required to fill the frame.
         * The samples are stored in the frame temporarily. */
        if (av_audio_fifo_read(fifo, (void **)output_frame->data, frame_size) < frame_size) {
            fprintf(stderr, "Could not read data from FIFO\n");
            av_frame_free(&output_frame);
            return AVERROR_EXIT;
        }
    
        /* Encode one frame worth of audio samples. */
        if (encode_audio_frame(output_frame, output_format_context,
                               output_codec_context, &data_written)) {
            av_frame_free(&output_frame);
            return AVERROR_EXIT;
        }
        av_frame_free(&output_frame);
        return 0;
    }
    
    /**
     * Write the trailer of the output file container.
     */
    static int write_output_file_trailer(AVFormatContext *output_format_context)
    {
        int error;
        if ((error = av_write_trailer(output_format_context)) < 0) {
            fprintf(stderr, "Could not write output file trailer (error '%s')\n",
                    av_err2str(error));
            return error;
        }
        return 0;
    }
    
    int main(int argc, char **argv)
    {
        AVFormatContext *input_format_context = NULL, *output_format_context = NULL;
        AVCodecContext *input_codec_context = NULL, *output_codec_context = NULL;
        SwrContext *resample_context = NULL;
        AVAudioFifo *fifo = NULL;
        int ret = AVERROR_EXIT;
    
        if (argc != 3) {
            fprintf(stderr, "Usage: %s <input file> <output file>\n", argv[0]);
            exit(1);
        }
    

        if (open_input_file(argv[1], &input_format_context,
                            &input_codec_context))
            goto cleanup;

        if (open_output_file(argv[2], input_codec_context,
                             &output_format_context, &output_codec_context))
            goto cleanup;

        if (init_resampler(input_codec_context, output_codec_context,
                           &resample_context))
            goto cleanup;

        if (init_fifo(&fifo, output_codec_context))
            goto cleanup;

        if (write_output_file_header(output_format_context))
            goto cleanup;
    
        while (1) {
            /* Use the encoder's desired frame size for processing. */
            const int output_frame_size = output_codec_context->frame_size;
            int finished                = 0;
    
            while (av_audio_fifo_size(fifo) < output_frame_size) {
                /* Decode one frame worth of audio samples, convert it to the
                 * output sample format and put it into the FIFO buffer. */
                if (read_decode_convert_and_store(fifo, input_format_context,
                                                  input_codec_context,
                                                  output_codec_context,
                                                  resample_context, &finished))
                    goto cleanup;
    
                if (finished)
                    break;
            }
    
            while (av_audio_fifo_size(fifo) >= output_frame_size ||
                   (finished && av_audio_fifo_size(fifo) > 0))
                if (load_encode_and_write(fifo, output_format_context,
                                          output_codec_context))
                    goto cleanup;
    
            if (finished) {
                int data_written;
                do {
                    if (encode_audio_frame(NULL, output_format_context,
                                           output_codec_context, &data_written))
                        goto cleanup;
                } while (data_written);
                break;
            }
        }
    
        if (write_output_file_trailer(output_format_context))
            goto cleanup;
        ret = 0;
    
    cleanup:
        if (fifo)
            av_audio_fifo_free(fifo);
        swr_free(&resample_context);
        if (output_codec_context)
            avcodec_free_context(&output_codec_context);
        if (output_format_context) {
            avio_closep(&output_format_context->pb);
            avformat_free_context(output_format_context);
        }
        if (input_codec_context)
            avcodec_free_context(&input_codec_context);
        if (input_format_context)
            avformat_close_input(&input_format_context);
    
        return ret;
    }

Solution

After going through the ffmpeg/libav mailing list, particularly https://ffmpeg.org/pipermail/libav-user/2017-July/010496.html, I was able to modify the ffmpeg transcode_aac.c example to perform the sample rate conversion.

In the original code, the main loop reads/decode/covert/store in one function before passing the samples to a AVAudioFifo which is used by the encoder.

Some encoders expects a specific number of samples - if you provide less, it appears the encoder pads up to expected and this results in the glitches mentioned in my first attempt.

The key, as per the ffmpeg mailing list, is to buffer / concat the decoded input samples until we have enough samples for at least one frame for the encoder. To do this we split the read/decode from the convert/store with the read/decode data being stored in a new intermediary AVAudioFifo. Once the intermediary fifo has enough samples, they get converted and the output is added to the original fifo.

static int read_decode_and_store(AVAudioFifo *fifo,
                 AVFormatContext *input_format_context,
                 AVCodecContext *input_codec_context,
                 const int audio_stream_idx,
                 int *finished)
{
    AVFrame *input_frame = NULL;
    int data_present = 0;
    int ret = AVERROR_EXIT;

    if (init_input_frame(&input_frame))
        goto cleanup;

    if (decode_audio_frame(input_frame, input_format_context,
                           input_codec_context, audio_stream_idx, &data_present, finished))
        goto cleanup;

    if (*finished) {
        ret = 0;
        goto cleanup;
    }

    if (data_present) {
        /* Add the converted input samples to the FIFO buffer for later processing. */
        if (add_samples_to_fifo(fifo, (uint8_t**)input_frame->extended_data, input_frame->nb_samples))
            goto cleanup;
    }
    ret = 0;

cleanup:
    av_frame_free(&input_frame);

    return ret;
}



static int  load_convert_and_store(AVAudioFifo* output_samples_fifo, const AVFormatContext* output_context, AVCodecContext* output_codec_context, int output_frame_size,
                               AVAudioFifo* input_samples_fifo,  const AVFormatContext* input_context, AVCodecContext* input_codec_context,
                   SwrContext* resample_context)
{
    uint8_t **converted_input_samples = NULL;
    int  ret = AVERROR_EXIT;

    AVFrame *input_frame;
    const int frame_size = FFMIN(av_audio_fifo_size(input_samples_fifo),
                                 output_frame_size);

    // yes this is init_output_frame
    if (init_output_frame(&input_frame, input_codec_context, frame_size))
        return AVERROR_EXIT;

    if (av_audio_fifo_read(input_samples_fifo, (void **)input_frame->data, frame_size) < frame_size) {
        fprintf(stderr, "Could not read data from input samples FIFO");
        av_frame_free(&input_frame);
        return AVERROR_EXIT;
    }

    int  nb_samples = (output_codec_context->sample_rate == input_codec_context->sample_rate) ?
    input_frame->nb_samples :
    av_rescale_rnd(swr_get_delay(resample_context, input_codec_context->sample_rate) + input_frame->nb_samples, output_codec_context->sample_rate, input_codec_context->sample_rate, AV_ROUND_UP);

    if (init_converted_samples(&converted_input_samples, output_codec_context,
        nb_samples))
    goto cleanup;

    /* **** Modify convert_samples() to return the value from swr_convert() **** */
    if ( (nb_samples = convert_samples((const uint8_t**)input_frame->extended_data, input_frame->nb_samples,
        converted_input_samples, output_codec_context->frame_size,
        resample_context)) < 0)
    goto cleanup;

    if (add_samples_to_fifo(output_samples_fifo, converted_input_samples, nb_samples))
    goto cleanup;

    ret = 0;

cleanup:
    if (converted_input_samples) {
        av_freep(&converted_input_samples[0]);
        free(converted_input_samples);
    }
    av_frame_free(&input_frame);

    return ret;
}


int main()
{
   ...

    while (1)
    {
        const int output_frame_size = output_codec_context->frame_size;
        int finished                = 0;

    /* Re: Resample frame to specified number of samples
     * https://ffmpeg.org/pipermail/libav-user/2017-July/010496.html
     * Yes, you need to buffer sufficient audio frames to feed to the encoder.
     *
     * Calculate the number of in samples:
     in_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx, c->sample_rate) +
     out_nb_samples,
     in_sample_rate, c->sample_rate, AV_ROUND_DOWN);

     then allocate buffers to concatenate the in samples until you have enough
     to pass to swr_ctx.
     */
    while (av_audio_fifo_size(input_samples_fifo) < output_frame_size) {
        if (read_decode_and_store(input_samples_fifo,
            input_format_context, input_codec_context,
            audio_stream_idx,
            &finished))
        goto cleanup;

        if (finished)
        break;
    }

    while (av_audio_fifo_size(input_samples_fifo) >= output_frame_size ||
        (finished && av_audio_fifo_size(input_samples_fifo) > 0)) {
        /* take all input samples and convert them before handing off to encoder
         */
        if (load_convert_and_store(fifo,
            output_format_context, output_codec_context, output_frame_size,
            input_samples_fifo, input_format_context, input_codec_context,
            resample_context))
        goto cleanup;
    }
    }


    /* If we have enough samples for the encoder, we encode them.
     * At the end of the file, we pass the remaining samples to
     * the encoder. */
    .... // existing code

}