I am writing an audio transcoding application using ffmpeg libraries. Here is my code
/*
* File: main.cpp
* Author: vinod
* Compile with "g++ -std=c++11 -o audiotranscode main.cpp -lavformat -lavcodec -lavutil -lavfilter"
*
*/
#if !defined PRId64 || PRI_MACROS_BROKEN
#undef PRId64
#define PRId64 "lld"
#endif
#define __STDC_FORMAT_MACROS
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <stdint.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libavutil/frame.h>
#include <libavutil/timestamp.h>
#include <libavformat/avformat.h>
#include <libavfilter/avfilter.h>
#include <libavfilter/buffersrc.h>
#include <libavfilter/buffersink.h>
#include <libswscale/swscale.h>
#include <libavutil/opt.h>
#ifdef __cplusplus
}
#endif
#include <iostream>
using namespace std;
int select_stream, got_frame, got_packet;
AVFormatContext *in_fmt_ctx = NULL, *out_fmt_ctx = NULL;
AVCodec *dec_codec = NULL, * enc_codec = NULL;
AVStream *audio_st = NULL;
AVCodecContext *enc_ctx = NULL, *dec_ctx = NULL;
AVFrame *pFrame = NULL, * pFrameFiltered = NULL;
AVFilterGraph *filter_graph = NULL;
AVFilterContext *buffersrc_ctx = NULL;
AVFilterContext *buffersink_ctx = NULL;
AVPacket packet;
string inFileName = "/home/vinod/vinod/Media/univac.webm";
string outFileName = "audio_extracted.m4a";
int target_bit_rate = 128000,
sample_rate = 22050,
channels = 1;
AVSampleFormat sample_fmt = AV_SAMPLE_FMT_S16;
string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono";
int log_averror(int errcode)
{
char *errbuf = (char *) calloc(AV_ERROR_MAX_STRING_SIZE, sizeof(char));
av_strerror(errcode, errbuf, AV_ERROR_MAX_STRING_SIZE);
std::cout << "Error - " << errbuf << std::endl;
delete [] errbuf;
return -1;
}
/**
* Initialize conversion filter */
int initialize_audio_filter()
{
char args[512];
int ret;
AVFilter *buffersrc = avfilter_get_by_name("abuffer");
AVFilter *buffersink = avfilter_get_by_name("abuffersink");
AVFilterInOut *outputs = avfilter_inout_alloc();
AVFilterInOut *inputs = avfilter_inout_alloc();
filter_graph = avfilter_graph_alloc();
const enum AVSampleFormat out_sample_fmts[] = {sample_fmt, AV_SAMPLE_FMT_NONE};
const int64_t out_channel_layouts[] = {av_get_default_channel_layout(out_fmt_ctx -> streams[0] -> codec -> channels), -1};
const int out_sample_rates[] = {out_fmt_ctx -> streams[0] -> codec -> sample_rate, -1};
if (!dec_ctx->channel_layout)
dec_ctx->channel_layout = av_get_default_channel_layout(dec_ctx->channels);
snprintf(args, sizeof(args), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
in_fmt_ctx -> streams[select_stream] -> time_base.num, in_fmt_ctx -> streams[select_stream] -> time_base.den,
dec_ctx->sample_rate,
av_get_sample_fmt_name(dec_ctx->sample_fmt),
dec_ctx->channel_layout);
ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", args, NULL, filter_graph);
if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n");
return -1;
}
ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, filter_graph);
if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n");
return ret;
}
ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1,
AV_OPT_SEARCH_CHILDREN);
if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n");
return ret;
}
ret = av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1,
AV_OPT_SEARCH_CHILDREN);
if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n");
return ret;
}
ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1,
AV_OPT_SEARCH_CHILDREN);
if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n");
return ret;
}
/* Endpoints for the filter graph. */
outputs -> name = av_strdup("in");
outputs -> filter_ctx = buffersrc_ctx;
outputs -> pad_idx = 0;
outputs -> next = NULL;
/* Endpoints for the filter graph. */
inputs -> name = av_strdup("out");
inputs -> filter_ctx = buffersink_ctx;
inputs -> pad_idx = 0;
inputs -> next = NULL;
string filter_desc = filter_description;
if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_desc.c_str(), &inputs, &outputs, NULL)) < 0) {
log_averror(ret);
exit(1);
}
if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) {
log_averror(ret);
exit(1);
}
/* Print summary of the sink buffer
* Note: args buffer is reused to store channel layout string */
AVFilterLink *outlink = buffersink_ctx->inputs[0];
av_get_channel_layout_string(args, sizeof(args), -1, outlink->channel_layout);
av_log(NULL, AV_LOG_INFO, "Output: srate:%dHz fmt:%s chlayout:%s\n",
(int) outlink->sample_rate,
(char *) av_x_if_null(av_get_sample_fmt_name((AVSampleFormat) outlink->format), "?"),
args);
return 0;
}
/*
*
*/
int main(int argc, char **argv)
{
int ret;
cout << "Hello World" << endl;
printf("abcd");
avcodec_register_all();
av_register_all();
avfilter_register_all();
/* open input file, and allocate format context */
if (avformat_open_input(&in_fmt_ctx, inFileName.c_str(), NULL, NULL) < 0) {
std::cout << "error opening input file - " << inFileName << std::endl;
return -1;
}
/* retrieve stream information */
if (avformat_find_stream_info(in_fmt_ctx, NULL) < 0) {
std::cerr << "Could not find stream information in the input file " << inFileName << std::endl;
}
/* Dump format details */
printf("\n ---------------------------------------------------------------------- \n");
av_dump_format(in_fmt_ctx, 0, inFileName.c_str(), 0);
printf("\n ---------------------------------------------------------------------- \n");
/* Choose a audio stream */
select_stream = av_find_best_stream(in_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec_codec, 0);
if (select_stream == AVERROR_STREAM_NOT_FOUND) {
std::cerr << "No audio stream found" << std::endl;
return -1;
}
if (select_stream == AVERROR_DECODER_NOT_FOUND) {
std::cerr << "No suitable decoder found" << std::endl;
return -1;
}
dec_ctx = in_fmt_ctx -> streams[ select_stream] -> codec;
av_opt_set_int(dec_ctx, "refcounted_frames", 1, 0);
/* init the audio decoder */
if ((ret = avcodec_open2(dec_ctx, dec_codec, NULL)) < 0) {
av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n");
return ret;
}
/* allocate output context */
ret = avformat_alloc_output_context2(&out_fmt_ctx, NULL, NULL,
outFileName.c_str());
if (ret < 0) {
std::cerr << "Could not create output context for the file " << outFileName << std::endl;
return -1;
}
/* find the encoder */
enum AVCodecID codec_id = out_fmt_ctx -> oformat -> audio_codec;
enc_codec = avcodec_find_encoder(codec_id);
if (!(enc_codec)) {
std::cerr << "Could not find encoder for - " << avcodec_get_name(codec_id) << std::endl;
return -1;
}
/* add a new stream */
audio_st = avformat_new_stream(out_fmt_ctx, enc_codec);
if (!audio_st) {
std::cerr << "Could not add audio stream - " << std::endl;
}
/* Initialise audio codec */
audio_st -> id = out_fmt_ctx -> nb_streams - 1;
enc_ctx = audio_st -> codec;
enc_ctx -> codec_id = codec_id;
enc_ctx -> codec_type = AVMEDIA_TYPE_AUDIO;
enc_ctx -> bit_rate = target_bit_rate;
enc_ctx -> sample_rate = sample_rate;
enc_ctx -> sample_fmt = sample_fmt;
enc_ctx -> channels = channels;
enc_ctx -> channel_layout = av_get_default_channel_layout(enc_ctx -> channels);
/* Some formats want stream headers to be separate. */
if (out_fmt_ctx -> oformat -> flags & AVFMT_GLOBALHEADER) {
enc_ctx -> flags |= CODEC_FLAG_GLOBAL_HEADER;
}
ret = avcodec_open2(out_fmt_ctx -> streams[0] -> codec, enc_codec, NULL);
if (ret < 0) {
std::cerr << "Could not create codec context for the file " << outFileName << std::endl;
return -1;
}
/* Initialize filter */
initialize_audio_filter();
if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE)) {
int ret = avio_open(& out_fmt_ctx -> pb, outFileName.c_str(),
AVIO_FLAG_WRITE);
if (ret < 0) {
log_averror(ret);
return -1;
}
}
/* Write header */
if (avformat_write_header(out_fmt_ctx, NULL) < 0) {
if (ret < 0) {
log_averror(ret);
return -1;
}
}
/* Allocate frame */
pFrame = av_frame_alloc();
if (!pFrame) {
std::cerr << "Could not allocate frame\n";
return -1;
}
pFrameFiltered = av_frame_alloc();
if (!pFrameFiltered) {
std::cerr << "Could not allocate frame\n";
return -1;
}
av_init_packet(&packet);
packet.data = NULL;
packet.size = 0;
/* Read packet from the stream */
while (av_read_frame(in_fmt_ctx, &packet) >= 0) {
if (packet.stream_index == select_stream) {
avcodec_get_frame_defaults(pFrame);
ret = avcodec_decode_audio4(dec_ctx, pFrame, &got_frame, &packet);
if (ret < 0) {
log_averror(ret);
return ret;
}
printf("Decoded packet pts : %ld ", packet.pts);
printf("Frame Best Effor pts : %ld \n", pFrame->best_effort_timestamp);
/* Set frame pts */
pFrame -> pts = av_frame_get_best_effort_timestamp(pFrame);
if (got_frame) {
/* push the decoded frame into the filtergraph */
ret = av_buffersrc_add_frame_flags(buffersrc_ctx, pFrame, AV_BUFFERSRC_FLAG_KEEP_REF);
if (ret < 0) {
log_averror(ret);
return ret;
}
/* pull filtered frames from the filtergraph */
while (1) {
ret = av_buffersink_get_frame(buffersink_ctx, pFrameFiltered);
if ((ret == AVERROR(EAGAIN)) || (ret == AVERROR_EOF)) {
break;
}
if (ret < 0) {
printf("Error while getting filtered frames from filtergraph\n");
log_averror(ret);
return -1;
}
/* Initialize the packets */
AVPacket encodedPacket = {0};
av_init_packet(&encodedPacket);
ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, pFrameFiltered, &got_packet);
if (!ret && got_packet && encodedPacket.size) {
/* Set correct pts and dts */
if (encodedPacket.pts != AV_NOPTS_VALUE) {
encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base,
out_fmt_ctx -> streams[0] -> time_base);
}
if (encodedPacket.dts != AV_NOPTS_VALUE) {
encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base,
out_fmt_ctx -> streams[0] -> time_base);
}
printf("Encoded packet pts %ld\n", encodedPacket.pts);
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);
if (ret < 0) {
log_averror(ret);
return -1;
}
} else if (ret < 0) {
log_averror(ret);
return -1;
}
av_frame_unref(pFrameFiltered);
}
av_frame_unref(pFrame);
}
}
}
/* Flush delayed frames from encoder*/
got_packet=1;
while (got_packet) {
AVPacket encodedPacket = {0};
av_init_packet(&encodedPacket);
ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, NULL, &got_packet);
if (!ret && got_packet && encodedPacket.size) {
/* Set correct pts and dts */
if (encodedPacket.pts != AV_NOPTS_VALUE) {
encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base,
out_fmt_ctx -> streams[0] -> time_base);
}
if (encodedPacket.dts != AV_NOPTS_VALUE) {
encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base,
out_fmt_ctx -> streams[0] -> time_base);
}
printf("Encoded packet pts %ld\n", encodedPacket.pts);
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);
if (ret < 0) {
log_averror(ret);
return -1;
}
} else if (ret < 0) {
log_averror(ret);
return -1;
}
}
/* Write Trailer */
av_write_trailer(out_fmt_ctx);
avfilter_graph_free(&filter_graph);
if (dec_ctx)
avcodec_close(dec_ctx);
avformat_close_input(&in_fmt_ctx);
av_frame_free(&pFrame);
av_frame_free(&pFrameFiltered);
if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE))
avio_close(out_fmt_ctx -> pb);
avcodec_close(out_fmt_ctx->streams[0]->codec);
avformat_free_context(out_fmt_ctx);
return 0;
}
The audio file after transcoding is same duration as the input. But its completely noisy. Can somebody tell me what I am doing wrong here!
I have found out where the problem was and it has been resolved.
When the output file was opened in audacity, it was seen that there were unwanted silences inserted in the audio signal. The problem was with the 'number of samples per frame' supplied to the encoder.
Different codecs expect different frame sizes for encoding. And aac encoder expects a size of 1024. This can be seen by observing enc_ctx->frame_size
after execution of avcodec_open2()
.
The filter needs to supply a frame with 1024 number of samples per channel to the encoder.
So in my code, pFrameFiltered
needs to have exactly 1024 number of samples per channel. If the its less than 1024 , the encoder appends zeros to make it to 1024 samples and then encodes it.
This can be solved by either having our own fifo queue or by using the filter available with the ffmpeg audio filters. We need to use a filter asetnsamples=n=1024:p=0
as explained here. So the alteration required was
`string filter_description =
"aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono,asetnsamples=n=1024:p=0";`
Just play around with the value of n
in the filter to understand better. Check the enc_ctx->frame_size
field set by avcodec_open2( ) and set the value of n
appropriately.