How to convert same audio twice using libswresamples's swr_convert

I'm working on an audio processing system that sometimes requires that the same audio be resampled twice. The first resampling of the audio from FFmpeg works fine, the second results in distorted audio. I've reproduced this problem by modifying the resampling_audio example provided by FFmpeg. How do I convert the same audio twice using swr_convert?

Below I've attached a modified version of the resampling_audio example. In order to reproduce the issue, follow these steps:

Clone FFmepg project at https://github.com/FFmpeg/FFmpeg
Run ./configure
Run make -j4 examples (this will take awhile the first time)
Run doc/examples/resampling_audio to produce expected output
Replace doc/examples/resampling_audio.c with the version I've attached below
Run make -j4 examples
Run doc/examples/resampling_audio again (with new args) to output two new files (one for each conversion).
Import each file into Audacity as raw data, the first file should be 44100 Hz, the second should be 32000 Hz.
The first file will sound the same as the original, the second file will be distorted.

The environment I ran this in was Ubuntu 16.04; I then copied the output files to a Windows PC to open them in Audacity.

Here is my modified resampling_audio.c file. I've created some new variables and copied the blocks of code that do the conversion. The first conversion should be unchanged, the second conversion takes in data from the first conversion and attempts to convert it again.

/*
 * Copyright (c) 2012 Stefano Sabatini
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/**
 * @example resampling_audio.c
 * libswresample API use example.
 */

#include <libavutil/opt.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>

static int get_format_from_sample_fmt(const char **fmt,
                                      enum AVSampleFormat sample_fmt)
{
    int i;
    struct sample_fmt_entry {
        enum AVSampleFormat sample_fmt; const char *fmt_be, *fmt_le;
    } sample_fmt_entries[] = {
        { AV_SAMPLE_FMT_U8,  "u8",    "u8"    },
        { AV_SAMPLE_FMT_S16, "s16be", "s16le" },
        { AV_SAMPLE_FMT_S32, "s32be", "s32le" },
        { AV_SAMPLE_FMT_FLT, "f32be", "f32le" },
        { AV_SAMPLE_FMT_DBL, "f64be", "f64le" },
    };
    *fmt = NULL;

    for (i = 0; i < FF_ARRAY_ELEMS(sample_fmt_entries); i++) {
        struct sample_fmt_entry *entry = &sample_fmt_entries[i];
        if (sample_fmt == entry->sample_fmt) {
            *fmt = AV_NE(entry->fmt_be, entry->fmt_le);
            return 0;
        }
    }

    fprintf(stderr,
            "Sample format %s not supported as output format\n",
            av_get_sample_fmt_name(sample_fmt));
    return AVERROR(EINVAL);
}

/**
 * Fill dst buffer with nb_samples, generated starting from t.
 */
static void fill_samples(double *dst, int nb_samples, int nb_channels, int sample_rate, double *t)
{
    int i, j;
    double tincr = 1.0 / sample_rate, *dstp = dst;
    const double c = 2 * M_PI * 440.0;

    /* generate sin tone with 440Hz frequency and duplicated channels */
    for (i = 0; i < nb_samples; i++) {
        *dstp = sin(c * *t);
        for (j = 1; j < nb_channels; j++)
            dstp[j] = dstp[0];
        dstp += nb_channels;
        *t += tincr;
    }
}

int main(int argc, char **argv)
{
    int64_t src_ch_layout = AV_CH_LAYOUT_STEREO, dst_ch_layout = AV_CH_LAYOUT_SURROUND;
    int src_rate = 48000, dst_rate = 44100;
    uint8_t **src_data = NULL, **dst_data = NULL, **dst_data2 = NULL;
    int src_nb_channels = 0, dst_nb_channels = 0;
    int src_linesize, dst_linesize;
    int src_nb_samples = 1024, dst_nb_samples, max_dst_nb_samples, dst_nb_samples2, max_dst_nb_samples2;
    enum AVSampleFormat src_sample_fmt = AV_SAMPLE_FMT_DBL, dst_sample_fmt = AV_SAMPLE_FMT_S16;
    const char *dst_filename = NULL, *dst_filename2 = NULL;
    FILE *dst_file, *dst_file2;
    int dst_bufsize, dst_bufsize2;
    const char *fmt;
    struct SwrContext *swr_ctx;
    struct SwrContext *swr_ctx2;
    double t;
    int ret;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s output_file_first output_file_second\n"
                "API example program to show how to resample an audio stream with libswresample.\n"
                "This program generates a series of audio frames, resamples them to a specified "
                "output format and rate and saves them to an output file named output_file.\n",
            argv[0]);
        exit(1);
    }
    dst_filename = argv[1];
    dst_filename2 = argv[2];

    dst_file = fopen(dst_filename, "wb");
    if (!dst_file) {
        fprintf(stderr, "Could not open destination file %s\n", dst_filename);
        exit(1);
    }



    dst_file2 = fopen(dst_filename2, "wb");
    if (!dst_file2) {
        fprintf(stderr, "Could not open destination file 2 %s\n", dst_filename2);
        exit(1);
    }



    /* create resampler context */
    swr_ctx = swr_alloc();
    if (!swr_ctx) {
        fprintf(stderr, "Could not allocate resampler context\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    /* set options */
    av_opt_set_int(swr_ctx, "in_channel_layout",    src_ch_layout, 0);
    av_opt_set_int(swr_ctx, "in_sample_rate",       src_rate, 0);
    av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", src_sample_fmt, 0);

    av_opt_set_int(swr_ctx, "out_channel_layout",    dst_ch_layout, 0);
    av_opt_set_int(swr_ctx, "out_sample_rate",       dst_rate, 0);
    av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", dst_sample_fmt, 0);

    /* initialize the resampling context */
    if ((ret = swr_init(swr_ctx)) < 0) {
        fprintf(stderr, "Failed to initialize the resampling context\n");
        goto end;
    }


    /* create resampler context 2 */
    swr_ctx2 = swr_alloc();
    if (!swr_ctx2) {
        fprintf(stderr, "Could not allocate resampler context 2\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    /* set options */
    av_opt_set_int(swr_ctx2, "in_channel_layout",    dst_ch_layout, 0);
    av_opt_set_int(swr_ctx2, "in_sample_rate",       dst_rate, 0);
    av_opt_set_sample_fmt(swr_ctx2, "in_sample_fmt", dst_sample_fmt, 0);

    av_opt_set_int(swr_ctx2, "out_channel_layout",    dst_ch_layout, 0);
    av_opt_set_int(swr_ctx2, "out_sample_rate",       32000, 0);
    av_opt_set_sample_fmt(swr_ctx2, "out_sample_fmt", dst_sample_fmt, 0);

    /* initialize the resampling context */
    if ((ret = swr_init(swr_ctx2)) < 0) {
        fprintf(stderr, "Failed to initialize the resampling context 2\n");
        goto end;
    }

    /* allocate source and destination samples buffers */

    src_nb_channels = av_get_channel_layout_nb_channels(src_ch_layout);
    ret = av_samples_alloc_array_and_samples(&src_data, &src_linesize, src_nb_channels,
                                             src_nb_samples, src_sample_fmt, 0);
    if (ret < 0) {
        fprintf(stderr, "Could not allocate source samples\n");
        goto end;
    }

    /* compute the number of converted samples: buffering is avoided
     * ensuring that the output buffer will contain at least all the
     * converted input samples */
    max_dst_nb_samples = dst_nb_samples =
        av_rescale_rnd(src_nb_samples, dst_rate, src_rate, AV_ROUND_UP);

    /* buffer is going to be directly written to a rawaudio file, no alignment */
    dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);
    ret = av_samples_alloc_array_and_samples(&dst_data, &dst_linesize, dst_nb_channels,
                                             dst_nb_samples, dst_sample_fmt, 0);
    if (ret < 0) {
        fprintf(stderr, "Could not allocate destination samples\n");
        goto end;
    }


    /* compute the number of converted samples: buffering is avoided
     * ensuring that the output buffer will contain at least all the
     * converted input samples */
    max_dst_nb_samples2 = dst_nb_samples2 =
        av_rescale_rnd(dst_nb_samples, 32000, dst_rate, AV_ROUND_UP);

    /* buffer is going to be directly written to a rawaudio file, no alignment */
    // dst_nb_channels2  = av_get_channel_layout_nb_channels(dst_ch_layout);
    ret = av_samples_alloc_array_and_samples(&dst_data2, &dst_linesize, dst_nb_channels,
                                             dst_nb_samples2, dst_sample_fmt, 0);
    if (ret < 0) {
        fprintf(stderr, "Could not allocate destination samples 2\n");
        goto end;
    }

    t = 0;
    do {
        /* generate synthetic audio */
        fill_samples((double *)src_data[0], src_nb_samples, src_nb_channels, src_rate, &t);

        /* compute destination number of samples */
        dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx, src_rate) +
                                        src_nb_samples, dst_rate, src_rate, AV_ROUND_UP);
        if (dst_nb_samples > max_dst_nb_samples) {
            av_freep(&dst_data[0]);
            ret = av_samples_alloc(dst_data, &dst_linesize, dst_nb_channels,
                                   dst_nb_samples, dst_sample_fmt, 1);
            if (ret < 0)
                break;
            max_dst_nb_samples = dst_nb_samples;
        }

        /* convert to destination format */
        ret = swr_convert(swr_ctx, dst_data, dst_nb_samples, (const uint8_t **)src_data, src_nb_samples);
        if (ret < 0) {
            fprintf(stderr, "Error while converting\n");
            goto end;
        }

        dst_bufsize = av_samples_get_buffer_size(&dst_linesize, dst_nb_channels,
                                                 ret, dst_sample_fmt, 1);
        if (dst_bufsize < 0) {
            fprintf(stderr, "Could not get sample buffer size\n");
            goto end;
        }

        printf("t:%f in:%d out:%d\n", t, src_nb_samples, ret);
        fwrite(dst_data[0], 1, dst_bufsize, dst_file);

        /* compute destination number of samples 2 */
        dst_nb_samples2 = av_rescale_rnd(swr_get_delay(swr_ctx2, dst_rate) +
                                        dst_nb_samples2, 32000, dst_rate, AV_ROUND_UP);
        if (dst_nb_samples2 > max_dst_nb_samples2) {
            av_freep(&dst_data2[0]);
            ret = av_samples_alloc(dst_data2, &dst_linesize, dst_nb_channels,
                                   dst_nb_samples2, dst_sample_fmt, 1);
            if (ret < 0)
                break;
            max_dst_nb_samples2 = dst_nb_samples2;
        }

        /* convert to destination format */
        ret = swr_convert(swr_ctx2, dst_data2, dst_nb_samples2, (const uint8_t **)dst_data, dst_nb_samples);
        if (ret < 0) {
            fprintf(stderr, "Error while converting 2\n");
            goto end;
        }

        dst_bufsize2 = av_samples_get_buffer_size(&dst_linesize, dst_nb_channels,
                                                 ret, dst_sample_fmt, 1);
        if (dst_bufsize2 < 0) {
            fprintf(stderr, "Could not get sample buffer size 2\n");
            goto end;
        }

        printf("t:%f in:%d out:%d\n", t, dst_nb_samples, ret);
        fwrite(dst_data2[0], 1, dst_bufsize2, dst_file2);
    } while (t < 10);

    if ((ret = get_format_from_sample_fmt(&fmt, dst_sample_fmt)) < 0)
        goto end;
    fprintf(stderr, "Resampling succeeded. Play the output file with the command:\n"
            "ffplay -f %s -channel_layout %"PRId64" -channels %d -ar %d %s\n",
            fmt, dst_ch_layout, dst_nb_channels, dst_rate, dst_filename);

end:
    fclose(dst_file);

    if (src_data)
        av_freep(&src_data[0]);
    av_freep(&src_data);

    if (dst_data)
        av_freep(&dst_data[0]);
    av_freep(&dst_data);

    swr_free(&swr_ctx);
    return ret < 0;
}

Solution

I'd check to make sure you're passing the correct buffers into the input for each call to swr_convert(). Remember you need to flush the output of swr_convert(), so if you're passing the output of a call to swr_convert() into a second call, make sure flush the first swr_context first.

Edit by OP: Since the first audio file using the same output data had no problems, it's likely flushing the first convert in and of itself wouldn't solve the problem. However, if the first convert isn't fully being flushed, the second convert would need to use the output number of samples from the first convert (ret), not the expected number (dst_nb_samples) as the input number of samples. Making this fix solved the problem.