音频转码 via FFmpeg
转码(transcoding)其实就是把音频从一种编码转换成另一种编码的过程,如 MP3 → WMA。基本流程如下图:
FFmpeg 简介
FFmpeg 是一套可以用来记录、转换数字音频、视频,并能将其转化为流的开源计算机程序。采用 LGPL 或 GPL 许可证。它提供了录制、转换以及流化音视频的完整解决方案。它包含了非常先进的音频 / 视频编解码库 libavcodec,为了保证高可移植性和编解码质量,libavcodec 里很多 code 都是从头开发的。
FFmpeg 在 Linux 平台下开发,但它同样也可以在其它操作系统环境中编译运行,包括 Windows、Mac OS X 等。这个项目最早由 Fabrice Bellard 发起,2004 年至 2015 年间由 Michael Niedermayer 主要负责维护。许多 FFmpeg 的开发人员都来自 MPlayer 项目,而且当前 FFmpeg 也是放在 MPlayer 项目组的服务器上。项目的名称来自 MPEG 视频编码标准,前面的 “FF” 代表 “Fast Forward”。
FFmpeg 命令行转码
FFmpeg 提供了命令行的方式对音频进行转码,so easy:
>ffmpeg.exe -i d:\test.mp3 d:\test.aac
[mp3 @ 0056d0c0] Estimating duration from bitrate, this may be inaccurate
Input #0, mp3, from 'd:\test.mp3':
Duration: 00:00:11.47, start: 0.000000, bitrate: 192 kb/s
Stream #0:0: Audio: mp3, 44100 Hz, stereo, s16p, 192 kb/s
Stream mapping:
Stream #0:0 -> #0:0 (mp3 (native) -> aac (native))
Output #0, adts, to 'd:\test.aac':
Metadata:
encoder : Lavf57.71.100
Stream #0:0: Audio: aac (LC), 44100 Hz, stereo, fltp, 128 kb/s
Metadata:
encoder : Lavc57.89.100 aac
size=183kB time=00:00:11.47 bitrate=130.5kbits/s speed=9.93x
video:0kB audio:179kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.887142%
可是我们就是爱折腾 ★~★
FFmpeg API 转码
Transcoding 流程图
Transcoding 代码
以下是整个转码过程的概要代码,略去各个函数的具体实现和资源释放:
本文中的代码基于 FFmpeg 4.1。
/* Transcoding driver (outline only — error handling and resource release
 * are omitted in the article).  Pipeline: open input & decoder, open output
 * & encoder, set up resampler + FIFO, then loop decode -> resample -> FIFO
 * -> encode until the input is exhausted, and finalize the container. */
int aud_stream_idx = open_input_file(in_file, AVMEDIA_TYPE_AUDIO, &in_fmt_ctx, &dec_ctx);
hr = open_output_audio_file(out_file, dec_ctx, &out_fmt_ctx, &enc_ctx);
/* Converts between the decoder's and the encoder's sample format/rate. */
hr = init_resampler(dec_ctx, enc_ctx, &resample_ctx);
/* The FIFO decouples the decoder's frame size from the encoder's. */
hr = init_fifo(&fifo, enc_ctx);
/* The container header must be written before any packet. */
hr = avformat_write_header(out_fmt_ctx, NULL);
while (true) {
    int finished = 0;
    hr = audio_transcode(
        in_fmt_ctx, dec_ctx,
        out_fmt_ctx, enc_ctx,
        fifo, resample_ctx, 0, &finished );
    GOTO_IF_FAILED(hr);
    if (finished)
        break;
}
/* Flush the muxer and write the container trailer. */
hr = av_write_trailer(out_fmt_ctx);
open_input_file 函数
打开文件获得一个 输入端 的 AVFormatContext 和 AVCodecContext。
int open_input_file(
const char *file_name,
AVMediaType stream_type,
AVFormatContext **in_fmt_ctx,
AVCodecContext **dec_ctx)
{
AVCodecContext *avctx = NULL;
AVCodec *decoder = NULL;
int hr = avformat_open_input(in_fmt_ctx, file_name, NULL, NULL);
hr = avformat_find_stream_info(*in_fmt_ctx, NULL);
int stream_index = -1;
for (unsigned int i = 0; i < (*in_fmt_ctx)->nb_streams; ++i) {
if ((*in_fmt_ctx)->streams[i]->codecpar->codec_type == stream_type) {
stream_index = i;
break;
}
}
AVStream* stream = (*in_fmt_ctx)->streams[stream_index];
decoder = avcodec_find_decoder(stream->codecpar->codec_id);
avctx = avcodec_alloc_context3(decoder);
hr = avcodec_parameters_to_context(avctx, stream->codecpar);
if (stream_type == AVMEDIA_TYPE_VIDEO)
avctx->framerate = av_guess_frame_rate(*in_fmt_ctx, stream, NULL);
hr = avcodec_open2(avctx, decoder, NULL);
*dec_ctx = avctx;
return stream_index;
}
open_output_audio_file 函数
打开文件获得一个 输出端 的 AVFormatContext 和 AVCodecContext。
/**
 * Open an output file and prepare an audio encoder whose sample rate
 * matches the decoder (to avoid an implicit sample-rate conversion).
 *
 * @param file_name   path of the output file (extension selects the muxer)
 * @param dec_ctx     opened decoder context of the input audio stream
 * @param out_fmt_ctx receives the muxer context (caller frees on success)
 * @param enc_ctx     receives the opened encoder context (caller frees)
 * @return 0 on success, a negative value on failure; on failure both
 *         out-params are reset to NULL.
 */
int open_output_audio_file(
    const char *file_name,
    AVCodecContext *dec_ctx,
    AVFormatContext **out_fmt_ctx,
    AVCodecContext **enc_ctx)
{
    /* BUG FIX: the original only checked enc_ctx but dereferenced dec_ctx
     * and *out_fmt_ctx as well. */
    RETURN_IF_NULL(dec_ctx);
    RETURN_IF_NULL(out_fmt_ctx);
    RETURN_IF_NULL(enc_ctx);

    AVCodecContext *codec_ctx = NULL;
    int hr = open_output_file(file_name, dec_ctx->codec_type, out_fmt_ctx, &codec_ctx);
    RETURN_IF_FAILED(hr);

    /* Stream index 0: open_output_file() just created the single stream. */
    hr = init_audio_encoder(dec_ctx->sample_rate, *out_fmt_ctx, 0, codec_ctx);
    GOTO_LABEL_IF_FAILED(hr, OnErr);

    *enc_ctx = codec_ctx;
    return 0;

OnErr:
    /* Unwind everything open_output_file() created. */
    avcodec_free_context(&codec_ctx);
    avio_closep(&(*out_fmt_ctx)->pb);
    avformat_free_context(*out_fmt_ctx);
    *out_fmt_ctx = NULL;
    *enc_ctx = NULL;
    return hr;
}
open_output_file 函数
通过文件后缀名 guess 一个最适合的编码器。
int open_output_file(
const char *file_name,
AVMediaType stream_type,
AVFormatContext **out_fmt_ctx,
AVCodecContext **enc_ctx )
{
RETURN_IF_NULL(file_name);
RETURN_IF_NULL(out_fmt_ctx);
RETURN_IF_NULL(enc_ctx);
int hr = -1;
AVIOContext *output_io_ctx = NULL;
/** Open the output file to write to it. */
hr = avio_open(&output_io_ctx, file_name, AVIO_FLAG_WRITE);
RETURN_IF_FAILED(hr);
/** Create a new format context for the output container format. */
*out_fmt_ctx = avformat_alloc_context();
RETURN_IF_NULL(*out_fmt_ctx);
/** Associate the output file (pointer) with the container format context. */
(*out_fmt_ctx)->pb = output_io_ctx;
/** Guess the desired container format based on the file extension. */
(*out_fmt_ctx)->oformat = av_guess_format(NULL, file_name, NULL);
GOTO_LABEL_IF_NULL((*out_fmt_ctx)->oformat, OnErr);
char*& url = (*out_fmt_ctx)->url;
if (NULL == url)
url = av_strdup(file_name);
/** Find the encoder to be used by its name. */
AVCodecID out_codec_id = AV_CODEC_ID_NONE;
switch (stream_type) {
case AVMEDIA_TYPE_AUDIO:
out_codec_id = (*out_fmt_ctx)->oformat->audio_codec;
break;
case AVMEDIA_TYPE_VIDEO:
out_codec_id = (*out_fmt_ctx)->oformat->video_codec;
break;
}
int stream_idx = add_stream_and_alloc_enc(out_codec_id, *out_fmt_ctx, enc_ctx);
GOTO_LABEL_IF_FALSE(stream_idx >= 0, OnErr);
return 0;
OnErr:
avio_closep(&(*out_fmt_ctx)->pb);
avformat_free_context(*out_fmt_ctx);
*out_fmt_ctx = NULL;
*enc_ctx = NULL;
return hr;
}
init_audio_encoder 函数
初始化音频的一些基本参数如:声道、采样率、比特率、时间戳基准等。
/**
 * Configure and open an audio encoder: channel layout, sample rate/format,
 * bit rate and time base, then propagate the parameters to the stream.
 *
 * @param sample_rate      output sample rate (use the input's to avoid
 *                         a sample-rate conversion)
 * @param out_fmt_ctx      output container the stream belongs to
 * @param audio_stream_idx index of the audio stream inside out_fmt_ctx
 * @param codec_ctx        allocated (unopened) encoder context to set up
 * @param sample_fmt       desired sample format; AV_SAMPLE_FMT_NONE picks
 *                         the codec's first supported format
 * @param channel_layout   output channel layout (default: stereo)
 * @param bit_rate         target bit rate in bits/s
 * @return 0 on success, a negative value on failure.
 */
int init_audio_encoder(
    int sample_rate,
    const AVFormatContext* out_fmt_ctx,
    unsigned int audio_stream_idx,
    AVCodecContext *codec_ctx,
    AVSampleFormat sample_fmt = AV_SAMPLE_FMT_NONE,
    uint64_t channel_layout = AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT,
    int64_t bit_rate = 64000 )
{
    int hr = -1;
    RETURN_IF_NULL(codec_ctx);
    RETURN_IF_NULL(out_fmt_ctx);
    RETURN_IF_FALSE(audio_stream_idx < out_fmt_ctx->nb_streams);
    /**
     * Set the basic encoder parameters.
     * The input file's sample rate is used to avoid a sample rate conversion.
     */
    codec_ctx->channel_layout = channel_layout;
    codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout);
    codec_ctx->sample_rate = sample_rate;
    if (sample_fmt != AV_SAMPLE_FMT_NONE) {
        codec_ctx->sample_fmt = sample_fmt;
    }
    else {
        /* BUG FIX: codec->sample_fmts may be NULL for codecs that do not
         * advertise a format list; the original dereferenced [0] blindly. */
        RETURN_IF_NULL(codec_ctx->codec);
        RETURN_IF_NULL(codec_ctx->codec->sample_fmts);
        codec_ctx->sample_fmt = codec_ctx->codec->sample_fmts[0];
    }
    codec_ctx->bit_rate = bit_rate;
    /** Allow the use of the experimental encoder (e.g. FFmpeg's native AAC). */
    codec_ctx->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    /* One tick per sample: time_base = 1 / sample_rate. */
    codec_ctx->time_base.den = sample_rate;
    codec_ctx->time_base.num = 1;
    /**
     * Some container formats (like MP4) require global headers to be present.
     * Mark the encoder so that it behaves accordingly.
     */
    if (out_fmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
        codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    AVStream* stream = out_fmt_ctx->streams[audio_stream_idx];
    stream->time_base = codec_ctx->time_base;
    /** Open the encoder for the audio stream to use it later. */
    hr = avcodec_open2(codec_ctx, codec_ctx->codec, NULL);
    RETURN_IF_FAILED(hr);
    /* Copy the opened encoder's final parameters onto the stream. */
    hr = avcodec_parameters_from_context(stream->codecpar, codec_ctx);
    RETURN_IF_FAILED(hr);
    return 0;
}
audio_transcode 函数
解码后的音频数据不能直接编码,而是要经过一个 FIFO(先入先出队列),这是因为音频编解码的 frame 大小往往是不一样的(视频 frame 大小狭义上是一样的,但格式(RGB,YUV 等)可能不一样)。
/**
 * Run one step of the transcode loop: decode input until the FIFO holds at
 * least one encoder frame, then drain the FIFO through the encoder.  When
 * the input is exhausted (*finished set), the remaining samples and the
 * encoder's delayed packets are flushed.
 *
 * @param aud_stream_index index of the audio stream in the input
 * @param finished         set to non-zero once the input is fully consumed
 * @param interleaved      use av_interleaved_write_frame() when true
 * @param init_pts         have the encoder path assign timestamps
 * @return 0 on success, a negative value on failure.
 */
int audio_transcode(
    AVFormatContext* in_fmt_ctx,
    AVCodecContext* dec_ctx,
    AVFormatContext* out_fmt_ctx,
    AVCodecContext* enc_ctx,
    AVAudioFifo* fifo,
    SwrContext* resample_ctx,
    int aud_stream_index,
    int* finished,
    bool interleaved = false,
    bool init_pts = false)
{
    int hr = -1;
    audio_base_info out_aud_info(enc_ctx);
    hr = decode_a_frame(in_fmt_ctx, dec_ctx, &out_aud_info, fifo, resample_ctx, aud_stream_index, finished);
    RETURN_IF_FAILED(hr);
    /**
     * If we have enough samples for the encoder, we encode them.
     * At the end of the file, we pass the remaining samples to the encoder.
     */
    while (av_audio_fifo_size(fifo) >= enc_ctx->frame_size ||
           (*finished && av_audio_fifo_size(fifo) > 0)) {
        /**
         * Take one frame worth of audio samples from the FIFO buffer,
         * encode it and write it to the output file.
         */
        hr = load_encode_and_write(fifo, out_fmt_ctx, enc_ctx, &out_aud_info, interleaved, init_pts);
        RETURN_IF_FAILED(hr);
    }
    /**
     * If we are at the end of the input file and have encoded
     * all remaining samples, flush the encoder's delayed packets.
     */
    if (*finished) {
        /* BUG FIX: the original discarded flush_encoder()'s return value,
         * silently ignoring failures while writing the last packets. */
        hr = flush_encoder(out_fmt_ctx, enc_ctx, interleaved, init_pts);
        RETURN_IF_FAILED(hr);
    }
    return 0;
}
decode_a_frame 函数
此处只是解码的外层 wrapper。
/**
 * Decode input until the FIFO holds at least one encoder-sized frame of
 * samples (or the input ends).
 *
 * Since the decoder's and the encoder's frame sizes may differ, the FIFO
 * buffers as many decoded frames as needed to make up at least one frame
 * worth of output samples.
 *
 * @return 0 on success, a negative value on failure.
 */
int decode_a_frame(
    AVFormatContext* in_fmt_ctx,
    AVCodecContext* dec_ctx,
    audio_base_info* out_aud_info,
    AVAudioFifo* fifo,
    SwrContext* resample_ctx,
    int audio_stream_index,
    int* finished)
{
    /* BUG FIX: the original initialized hr to AVERROR_EXIT; when the FIFO
     * already held enough samples the loop body never ran and the stale
     * error code was returned even though nothing went wrong. */
    int hr = 0;
    while (av_audio_fifo_size(fifo) < out_aud_info->frame_size) {
        /* Decode one frame worth of audio samples, convert it to the
         * output sample format and put it into the FIFO buffer. */
        hr = read_decode_convert_and_store(fifo, in_fmt_ctx, dec_ctx, out_aud_info,
                                           resample_ctx, audio_stream_index, finished);
        RETURN_IF_FAILED(hr);
        if (*finished)
            break;
    }
    return hr;
}
read_decode_convert_and_store 函数
继续 wrapper。
/**
 * Decode one packet's worth of audio frames from the input, resample each
 * decoded frame to the output format and append the samples to the FIFO.
 *
 * Ownership: all AVFrames placed in decoded_frames by decode_av_frame()
 * are owned here and freed before returning, on every path.
 *
 * @param finished set by decode_av_frame() when the input is exhausted
 * @return 0 on success (including clean end-of-input), negative on failure.
 */
int read_decode_convert_and_store(
    AVAudioFifo *fifo,
    AVFormatContext *in_fmt_ctx,
    AVCodecContext *dec_ctx,
    audio_base_info* out_aud_info,
    SwrContext *resampler_ctx,
    int audio_stream_index,
    int *finished)
{
    RETURN_IF_NULL(finished);
    /** Temporary storage of the input samples of the frame read from the file. */
    std::vector<AVFrame*> decoded_frames;
    int hr = AVERROR_EXIT;
    /** Decode one frame worth of audio samples. */
    hr = decode_av_frame(in_fmt_ctx, dec_ctx, audio_stream_index, decoded_frames, finished);
    /**
     * If we are at the end of the file and there are no more samples
     * in the decoder which are delayed, we are actually finished.
     * This must not be treated as an error.
     */
    if (*finished && decoded_frames.empty()) {
        hr = 0;
        goto RESOURCE_FREE;
    }
    /* Fail only when the decode produced nothing at all.  If frames did
     * come out despite an error code, fall through and process them —
     * hr is overwritten below, so the decode error is deliberately
     * swallowed in that case. */
    if (FAILED(hr) && decoded_frames.empty())
        GOTO_IF_FAILED(hr);
    /** If there is decoded data, convert and store it */
    for (size_t i = 0; i < decoded_frames.size(); ++i) {
        AVFrame* frame = decoded_frames[i];
        hr = resample_and_store(frame, dec_ctx->sample_rate, out_aud_info, resampler_ctx, fifo);
        GOTO_IF_FAILED(hr);
    }
    hr = 0;
RESOURCE_FREE:
    /* Free every decoded frame regardless of success or failure. */
    for (size_t i = 0; i < decoded_frames.size(); ++i)
        av_frame_free(&decoded_frames[i]);
    return hr;
}
decode_av_frame 函数
终于找到你了,亲爱的解码函数,不过她其实也是 FFmpeg 的终极 wrapper -_-!
注意:此处已经抛弃了 legacy 的 avcodec_decode_audio4,而是使用 avcodec_send_packet 和 avcodec_receive_frame,具体请参考 官方文档。
/**
 * Read packets from the input and decode them into AVFrames using the
 * send_packet/receive_frame API (avcodec_decode_audio4 is legacy).
 * Returns as soon as at least one frame was produced, or the input ends.
 *
 * @param stream_index  stream to decode; -1 means any stream
 * @param frames        receives decoded frames; ownership passes to the
 *                      caller, who must av_frame_free() each element
 * @param finished      set to 1 when the input and decoder are drained
 * @return 0 on success, a negative value on failure.
 */
int decode_av_frame(
    AVFormatContext *in_fmt_ctx,
    AVCodecContext *dec_ctx,
    int stream_index, // -1 means any stream
    std::vector<AVFrame*>& frames,
    int *finished)
{
    RETURN_IF_NULL(in_fmt_ctx);
    RETURN_IF_NULL(dec_ctx);
    RETURN_IF_NULL(finished);
    *finished = 0;
    AVFrame *frame = NULL;
    /** Packet used for temporary storage. */
    AVPacket in_pkt;
    int hr = -1;
    init_packet(&in_pkt);
    while (true) {
        /** Read one frame from the input file into a temporary packet. */
        hr = av_read_frame(in_fmt_ctx, &in_pkt);
        if (FAILED(hr)) {
            /** If we are at the end of the file, flush the decoder below. */
            if (hr == AVERROR_EOF)
                *finished = 1;
            else
                RETURN_IF_FAILED(hr);
        }
        else if ((stream_index >= 0) && (in_pkt.stream_index != stream_index)) {
            /* BUG FIX: the original leaked every packet belonging to a
             * non-selected stream by skipping the unref. */
            av_packet_unref(&in_pkt);
            continue;
        }
        else
            av_packet_rescale_ts(&in_pkt,
                in_fmt_ctx->streams[in_pkt.stream_index]->time_base,
                dec_ctx->time_base);
        /* A NULL packet enters drain mode once the input is exhausted. */
        hr = avcodec_send_packet(dec_ctx, *finished ? NULL : &in_pkt);
        /* BUG FIX: release the packet every iteration; the original only
         * unref'ed once after the loop, leaking one packet per read.
         * NOTE(review): on EAGAIN the packet was not consumed and its data
         * is dropped — same as the original behavior; confirm acceptable. */
        av_packet_unref(&in_pkt);
        if (SUCCEEDED(hr) || (hr == AVERROR(EAGAIN))) {
            while (true) {
                /** Initialize temporary storage for one input frame. */
                frame = av_frame_alloc();
                GOTO_IF_NULL(frame);
                hr = avcodec_receive_frame(dec_ctx, frame);
                if (SUCCEEDED(hr)) {
                    if (NULL != dec_ctx->hw_device_ctx) { // decoded by hardware
                        /* Transfer the frame out of GPU memory first. */
                        AVFrame* sw_frame = HW_dec_helper::convert_frame(frame);
                        GOTO_IF_NULL(sw_frame);
                        frames.push_back(sw_frame);
                        av_frame_free(&frame);
                    }
                    else
                        frames.push_back(frame);
                }
                else if (hr == AVERROR_EOF) {
                    *finished = 1;
                    break;
                }
                else if (hr == AVERROR(EAGAIN)) // need more packets
                    break;
                else
                    GOTO_IF_FAILED(hr);
            }
        }
        else if (hr == AVERROR_EOF)
            *finished = 1;
        else
            GOTO_IF_FAILED(hr);
        if (*finished || !frames.empty())
            break;
    }
    hr = 0;
RESOURCE_FREE:
    /* The last allocated frame (the one receive_frame rejected) is not in
     * 'frames' and must be freed here. */
    if (NULL != frame)
        av_frame_free(&frame);
    av_packet_unref(&in_pkt);
    return hr;
}
load_encode_and_write 函数
无止境的 wrapper,从 fifo 队列中读取目标大小的数据然后进行编码并写到文件中。
/**
 * Pull one encoder-sized chunk of samples out of the FIFO, encode it, and
 * mux the resulting packet(s) into the output file.
 *
 * @param interleaved use av_interleaved_write_frame() when true
 * @param init_pts    let the encoder path assign timestamps
 * @return 0 on success, a negative value on failure.
 */
int load_encode_and_write(
    AVAudioFifo* fifo,
    AVFormatContext* out_fmt_ctx,
    AVCodecContext* enc_ctx,
    audio_base_info* out_aud_info,
    bool interleaved,
    bool init_pts = true )
{
    /* Scratch frame holding the samples taken from the FIFO. */
    AVFrame *frame = NULL;
    int packets_written = 0;

    int hr = read_samples_from_fifo(fifo, out_aud_info, &frame);
    GOTO_IF_FAILED(hr);

    /* Push the frame through the encoder and write whatever comes out. */
    hr = encode_av_frame(frame, out_fmt_ctx, enc_ctx,
                         &packets_written, interleaved, init_pts);
    GOTO_IF_FAILED(hr);

    hr = 0;
RESOURCE_FREE:
    if (NULL != frame)
        av_frame_free(&frame);
    return hr;
}
read_samples_from_fifo 函数
顾名思义,不解释。
/**
 * Allocate a frame and fill it with up to one encoder frame's worth of
 * samples read from the FIFO.
 *
 * @param output_frame receives the allocated frame; on success the caller
 *                     owns it and must av_frame_free() it
 * @return 0 on success, a negative value on failure.
 */
int read_samples_from_fifo(AVAudioFifo* fifo, audio_base_info* out_aud_info, AVFrame** output_frame)
{
    RETURN_IF_NULL(fifo);
    RETURN_IF_NULL(out_aud_info);
    RETURN_IF_NULL(output_frame);
    int hr = -1;
    /**
     * Use the maximum number of possible samples per frame.
     * If there is less than the maximum possible frame size in the FIFO
     * buffer use this number. Otherwise, use the maximum possible frame size
     */
    int fifo_size = av_audio_fifo_size(fifo);
    const int frame_size = FFMIN(fifo_size, out_aud_info->frame_size);
    /** Initialize temporary storage for one output frame. */
    /* NOTE(review): init_audio_frame presumably sizes the frame from
     * out_aud_info->frame_size; on the final (flush) read frame_size may be
     * smaller, so the frame's nb_samples might not be adjusted to match the
     * samples actually read — verify init_audio_frame's behavior. */
    hr = init_audio_frame(output_frame, out_aud_info);
    RETURN_IF_FAILED(hr);
    /**
     * Read as many samples from the FIFO buffer as required to fill the frame.
     * The samples are stored in the frame temporarily.
     */
    int samples_read = av_audio_fifo_read(fifo, (void**)((*output_frame)->data), frame_size);
    /* A short read is treated as failure; the caller frees *output_frame. */
    RETURN_IF_FALSE(samples_read == frame_size);
    return 0;
}
encode_av_frame 函数
又见亲爱的。
/**
 * Send one frame to the encoder (NULL frame = flush), collect every packet
 * it produces, fix up timestamps and write the packets to the output file.
 *
 * @param frame        frame to encode, or NULL to drain the encoder
 * @param data_written set to 1 if at least one packet was written
 * @param interleaved  use av_interleaved_write_frame() when true
 * @param init_pts     assign pts here (audio: running sample count via
 *                     g_ttl_a_samples; video: running frame count, with
 *                     wall-clock-based frame duplication for live sources)
 * @return 0 on success, a negative value on failure.
 */
int encode_av_frame(
    AVFrame *frame,
    AVFormatContext *out_fmt_ctx,
    AVCodecContext *enc_ctx,
    int* data_written,
    bool interleaved,
    bool init_pts)
{
    // frame can be NULL which means to flush
    RETURN_IF_NULL(out_fmt_ctx);
    RETURN_IF_NULL(enc_ctx);
    RETURN_IF_NULL(data_written);
    *data_written = 0;
    int hr = -1;
    /* sometimes the video input (e.g. webcam) cannot match the desired frame rate,
     * then need to duplicate to several frames. */
    int duplicate_frames = 1;
    bool is_video = false;
    if (NULL != frame && init_pts) {
        if (enc_ctx->codec_type == AVMEDIA_TYPE_AUDIO) {
            /** Set a timestamp based on the sample rate for the container. */
            frame->pts = g_ttl_a_samples;
            g_ttl_a_samples += frame->nb_samples;
        }
        else {
            is_video = true;
            // For fixed-fps content, timebase should be 1/framerate and timestamp increments should be identically 1.
            if (g_ttl_v_frames > 0) {
                int64_t frame_gap = av_gettime_relative() - g_last_v_frame_ts;
                double duration = av_q2d(enc_ctx->time_base) * AV_TIME_BASE;
                duplicate_frames = max(1, (int)ceil(frame_gap / duration));
            }
            g_last_v_frame_ts = av_gettime_relative();
        }
    }
    /* Find the output stream matching the encoder's media type. */
    int stream_idx = 0;
    for (unsigned int i = 0; i < out_fmt_ctx->nb_streams; ++i) {
        if (out_fmt_ctx->streams[i]->codecpar->codec_type == enc_ctx->codec_type) {
            stream_idx = i;
            break;
        }
    }
    std::vector<AVPacket*> packets;   /* packets owned by this function */
    AVPacket* output_packet = NULL;
    for (int i = 0; i < duplicate_frames; ++i) {
        if (is_video && init_pts)
            frame->pts = g_ttl_v_frames++;
        hr = avcodec_send_frame(enc_ctx, frame);
        if (SUCCEEDED(hr) || (hr == AVERROR(EAGAIN))) {
            while (true) {
                /** Packet used for temporary storage. */
                output_packet = new AVPacket();
                init_packet(output_packet);
                hr = avcodec_receive_packet(enc_ctx, output_packet);
                if (SUCCEEDED(hr)) {
                    output_packet->stream_index = stream_idx;
                    packets.push_back(output_packet);
                }
                else if (hr == AVERROR(EAGAIN)) // need more input frames
                    break;
                else if (hr == AVERROR_EOF)
                    break;
                else
                    GOTO_IF_FAILED(hr);
            }
            /* BUG FIX: the receive loop always exits holding one freshly
             * allocated packet that receive_packet rejected (EAGAIN/EOF).
             * The original freed it only once after the outer loop, leaking
             * one packet per extra iteration when duplicate_frames > 1. */
            if (NULL != output_packet) {
                av_packet_unref(output_packet);
                delete output_packet;
                output_packet = NULL;
            }
        }
        else if (hr != AVERROR_EOF)
            GOTO_IF_FAILED(hr);
    }
    for (size_t i = 0; i < packets.size(); ++i) {
        // set pts based on stream time base.
        AVRational stream_tb = get_stream_time_base(out_fmt_ctx, enc_ctx->codec_type);
        AVPacket* packet = packets[i];
        switch (enc_ctx->codec_type) {
        case AVMEDIA_TYPE_VIDEO:
            if (init_pts) {
                /* pts counts frames (increment 1), so scale by the per-frame
                 * duration in stream time-base units. */
                packet->duration = av_rescale_q(1, enc_ctx->time_base, stream_tb);
                packet->pts *= packet->duration;
                packet->dts *= packet->duration;
            }
            else
                av_packet_rescale_ts(packet, enc_ctx->time_base, stream_tb);
            break;
        case AVMEDIA_TYPE_AUDIO:
            av_packet_rescale_ts(packet, enc_ctx->time_base, stream_tb);
            break;
        }
        /** Write one frame from the temporary packet to the output file. */
        if (interleaved)
            hr = av_interleaved_write_frame(out_fmt_ctx, packet);
        else
            hr = av_write_frame(out_fmt_ctx, packet);
        GOTO_IF_FAILED(hr);
        *data_written = 1;
    }
    hr = 0;
RESOURCE_FREE:
    /* Free all queued packets; output_packet is non-NULL only when an
     * error path jumped here from inside the receive loop. */
    for (size_t i = 0; i < packets.size(); ++i) {
        av_packet_unref(packets[i]);
        delete packets[i];
    }
    if (NULL != output_packet) {
        av_packet_unref(output_packet);
        delete output_packet;
    }
    return hr;
}
flush_encoder 函数
终于结束了,小屁屁要擦干净。
/**
 * Drain an encoder's delayed packets at end of stream.  Encoders that do
 * not buffer frames (no AV_CODEC_CAP_DELAY) need no flushing.
 *
 * @return 0 on success, a negative value on failure.
 */
int flush_encoder(
    AVFormatContext* format_ctx,
    AVCodecContext* codec_ctx,
    bool interleaved,
    bool init_pts)
{
    /* No delayed frames in this codec — nothing to drain. */
    if (!(codec_ctx->codec->capabilities & AV_CODEC_CAP_DELAY))
        return 0;

    /* Repeatedly send a NULL frame (drain mode) and write whatever the
     * encoder still produces, until a pass emits no packets. */
    for (int produced = 1; produced != 0; ) {
        produced = 0;
        int hr = encode_av_frame(NULL, format_ctx, codec_ctx,
                                 &produced, interleaved, init_pts);
        RETURN_IF_FAILED(hr);
    }
    return 0;
}
其他框架的转码
– EOF –