硬件平台:mac
FFmpeg版本:5.0.1
来源数据:采样率48000,单通道,FLTP格式,有设置了重采样参数。
目标编码器采样率44100,单通道,s16格式。
重采样之后写入pcm文件正常的。
但重采样数据编码成aac却是不对的,时间变短,明显加速的音频。
采样率同样是48000的情况下正常。
2022.07.27更新:
以上时间变短、加速的问题,后面确认了原因:在采样率不同的情况下,重采样输出的每帧 sample 数与 AAC 编码器要求的帧大小(frame_size)不一致,因此不能把重采样结果直接送去编码。正确做法是引入 FIFO 队列作为中间层:重采样数据先写入队列,再按编码器帧大小从队列中读取数据进行编码,才不会出问题。该部分代码还没实现,后续完成后再更新。
/**
* @file record2aac.cpp
* @brief record audio from mac microphone and encode to aac
*
* @copyright Copyright (c) 2022
*/
#include <cstdio>
#include <string>
#include <iostream>
extern "C" {
#include <libavutil/avassert.h>
#include <libavutil/timestamp.h>
#include <libavcodec/avcodec.h>
#include <libavdevice/avdevice.h>
#include <libavformat/avformat.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}
static void log_packet(const AVFormatContext *fmt_ctx, const AVPacket *pkt)
{
AVRational *time_base = &fmt_ctx->streams[pkt->stream_index]->time_base;
printf("pts:%s pts_time:%s dts:%s dts_time:%s duration:%s duration_time:%s stream_index:%d\n",
av_ts2str(pkt->pts), av_ts2timestr(pkt->pts, time_base),
av_ts2str(pkt->dts), av_ts2timestr(pkt->dts, time_base),
av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, time_base),
pkt->stream_index);
}
/**
 * Map an AVSampleFormat to the ffplay "-f" format-name string for the host's
 * endianness (e.g. AV_SAMPLE_FMT_S16 -> "s16le" on little-endian machines).
 *
 * On success *fmt points at a static string and 0 is returned; for
 * unsupported formats *fmt is set to nullptr and AVERROR(EINVAL) is returned.
 */
static int get_format_from_sample_fmt(const char **fmt,
                                      enum AVSampleFormat sample_fmt)
{
    struct sample_fmt_entry {
        enum AVSampleFormat sample_fmt;
        const char *fmt_be; // big-endian name
        const char *fmt_le; // little-endian name
    };
    static const sample_fmt_entry entries[] = {
        { AV_SAMPLE_FMT_U8, "u8", "u8" },
        { AV_SAMPLE_FMT_S16, "s16be", "s16le" },
        { AV_SAMPLE_FMT_S32, "s32be", "s32le" },
        { AV_SAMPLE_FMT_FLT, "f32be", "f32le" },
        { AV_SAMPLE_FMT_DBL, "f64be", "f64le" },
    };

    *fmt = nullptr;
    for (const sample_fmt_entry &entry : entries) {
        if (entry.sample_fmt == sample_fmt) {
            // AV_NE picks the big- or little-endian name for this host.
            *fmt = AV_NE(entry.fmt_be, entry.fmt_le);
            return 0;
        }
    }
    fprintf(stderr,
            "Sample format %s not supported as output format\n",
            av_get_sample_fmt_name(sample_fmt));
    return AVERROR(EINVAL);
}
/* Return 1 if the encoder advertises support for sample_fmt, 0 otherwise.
 * codec->sample_fmts is a AV_SAMPLE_FMT_NONE-terminated list. */
static int check_sample_fmt(const AVCodec *codec, enum AVSampleFormat sample_fmt)
{
    for (const enum AVSampleFormat *fmt = codec->sample_fmts;
         *fmt != AV_SAMPLE_FMT_NONE; fmt++) {
        if (*fmt == sample_fmt) {
            return 1;
        }
    }
    return 0;
}
/* Pick the supported sample rate closest to 44100 Hz; if the encoder does
 * not publish a list, assume 44100 is fine. */
static int select_sample_rate(const AVCodec *codec)
{
    if (!codec->supported_samplerates) {
        return 44100;
    }

    int best_samplerate = 0;
    for (const int *rate = codec->supported_samplerates; *rate; rate++) {
        // Keep whichever candidate is nearest to 44100.
        if (!best_samplerate ||
            abs(44100 - *rate) < abs(44100 - best_samplerate)) {
            best_samplerate = *rate;
        }
    }
    return best_samplerate;
}
/* Select the channel layout with the highest channel count.
 *
 * Returns a uint64_t AV_CH_LAYOUT_* bitmask (falling back to stereo when the
 * encoder publishes no list). The return type is widened from the original
 * int: channel layouts are 64-bit bitmasks, and returning int silently
 * truncated any layout using bits above 31. Widening is backward-compatible
 * for existing callers. */
static uint64_t select_channel_layout(const AVCodec *codec)
{
    if (!codec->channel_layouts) {
        return AV_CH_LAYOUT_STEREO;
    }

    uint64_t best_ch_layout = 0;
    int best_nb_channels = 0;
    for (const uint64_t *p = codec->channel_layouts; *p; p++) {
        int nb_channels = av_get_channel_layout_nb_channels(*p);
        if (nb_channels > best_nb_channels) {
            best_ch_layout = *p;
            best_nb_channels = nb_channels;
        }
    }
    return best_ch_layout;
}
/**
 * Feed one frame (or nullptr to flush) to the encoder and write every packet
 * it produces to the output context.
 *
 * Fixes vs. the original:
 *  - AVERROR(EAGAIN) / AVERROR_EOF from avcodec_receive_packet() are normal
 *    "need more input" / "fully drained" conditions, so return 0 there
 *    instead of -1.
 *  - Packet timestamps are rescaled from the encoder time base to the
 *    stream time base and stream_index is set before muxing (this was
 *    commented out); writing encoder-time-base timestamps into the muxer is
 *    a classic cause of wrong playback speed/duration.
 *
 * Returns 0 on success (including normal drain), negative on write error.
 */
static int encode(AVFormatContext *fmt_ctx, AVCodecContext *enc_ctx, AVStream *stream, AVPacket *packet, AVFrame *pframe)
{
    int ret = avcodec_send_frame(enc_ctx, pframe);
    if (ret < 0) {
        fprintf(stderr, "Error sending the frame to the encoder\n");
        return -1;
    }
    while (ret >= 0) {
        ret = avcodec_receive_packet(enc_ctx, packet);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            return 0; // encoder needs more input / is drained: not an error
        } else if (ret < 0) {
            fprintf(stderr, "Error encoding frame\n");
            exit(1);
        }
        // Convert timestamps into the muxer stream's time base and route the
        // packet to the right stream before writing.
        av_packet_rescale_ts(packet, enc_ctx->time_base, stream->time_base);
        packet->stream_index = stream->index;
        log_packet(fmt_ctx, packet);
        // av_interleaved_write_frame() takes ownership of the packet
        // reference, even on failure, so no explicit unref is needed.
        ret = av_interleaved_write_frame(fmt_ctx, packet);
        if (ret < 0) {
            printf("av_interleaved_write_frame fail err: %s\n", av_err2str(ret));
            break;
        }
    }
    return ret;
}
/**
 * Allocate an AVFrame configured for audio with the given format, layout,
 * rate and sample count; buffers are allocated only when nb_samples is
 * non-zero. Exits the process on allocation failure.
 */
static AVFrame *alloc_audio_frame(enum AVSampleFormat sample_fmt,
                                  uint64_t channel_layout, int sample_rate,
                                  int nb_samples)
{
    AVFrame *frame = av_frame_alloc();
    if (frame == nullptr) {
        fprintf(stderr, "Error allocating an audio frame\n");
        exit(1);
    }

    frame->format = sample_fmt;
    frame->channel_layout = channel_layout;
    frame->sample_rate = sample_rate;
    frame->nb_samples = nb_samples;

    // Only request data buffers when a sample count was given.
    if (nb_samples) {
        int rc = av_frame_get_buffer(frame, 0);
        if (rc < 0) {
            fprintf(stderr, "Error allocating an audio buffer\n");
            exit(1);
        }
    }
    return frame;
}
/**
 * Dump the capture stream's basic audio parameters (channel count, sample
 * rate, sample format name, bytes per sample) to stdout.
 */
void show_audio_input_ctx(AVStream *stream)
{
    AVCodecParameters *par = stream->codecpar;
    AVSampleFormat fmt = (AVSampleFormat)par->format;

    // Channel count
    std::cout << "channels: " << par->channels << std::endl;
    // Sampling rate in Hz
    std::cout << "sample rate: " << par->sample_rate << std::endl;
    // Sample format (e.g. fltp)
    std::cout << "sample format: "
              << av_get_sample_fmt_name(fmt)
              << std::endl;
    // Size of one sample of one channel
    std::cout << "bytes per sample: "
              << av_get_bytes_per_sample(fmt)
              << std::endl;
}
/**
 * Capture audio from the default macOS microphone (avfoundation ":0"),
 * decode it, resample to the encoder's target (44100 Hz, mono, s16),
 * dump the resampled PCM to out.pcm, and encode it to out.aac.
 *
 * NOTE(review): as the file preamble states, the AAC output is "accelerated"
 * when source and target sample rates differ. Two visible contributors:
 * frames out of swr_convert_frame() do not match the encoder's frame_size,
 * and out_frame->pts is advanced by 1 per frame rather than by nb_samples.
 * The planned fix (an audio FIFO between resampler and encoder) is not yet
 * implemented here.
 */
int main(int argc, char **argv)
{
    int ret = -1;
    int audio_stream_index = -1; // microphone input audio stream index
    int src_nb_samples = 0;
    int dst_nb_samples = 0;
    int max_dst_nb_samples = 0;
    // The three key audio properties: sample rate, sample format, and
    // channel layout (layout and channel count describe the same thing).
    AVSampleFormat src_sample_fmt;
    AVSampleFormat dst_sample_fmt;
    int src_sample_rate = 0;
    int dst_sample_rate = 0;
    int64_t src_ch_layout;
    int64_t dst_ch_layout;
    int src_nb_channels = 0;
    int dst_nb_channels = 0;
    uint8_t **src_data = nullptr;
    uint8_t **dst_data = nullptr;
    int src_linesize, dst_linesize;
    const char *dst_filename = NULL;
    FILE *dst_file;
    int dst_bufsize;
    // for input audio
    AVCodec *audio_decoder = nullptr;
    AVCodecContext *audio_dec_ctx = nullptr;
    // for resample
    struct SwrContext *swr_ctx = nullptr;
    // for audio encoder
    AVFormatContext *out_fmt_ctx;
    AVCodec *audio_encoder;
    AVCodecContext *audio_enc_ctx;
    const AVOutputFormat *out_fmt = nullptr;
    AVStream *out_stream;
    std::string aac_output_filename = "out.aac";
    // register device
    avdevice_register_all();
    AVFormatContext *fmt_ctx = avformat_alloc_context();
    // Look up the avfoundation input format (macOS capture backend).
    AVInputFormat *input_fmt = (AVInputFormat *)av_find_input_format("avfoundation");
    if (input_fmt != nullptr) {
        // Open the capture device; ":0" selects the default audio input.
        ret = avformat_open_input(&fmt_ctx, ":0", input_fmt,
                                  nullptr);
        if (ret < 0) {
            std::cerr << "avformat open input error: " << av_err2str(ret)
                      << std::endl;
            return -1;
        }
        if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
            std::cerr << __FUNCTION__ << " avformat_find_stream_info failed."
                      << std::endl;
        }
        // Find the first audio stream in the capture context.
        for (int i = 0; i < fmt_ctx->nb_streams; i++) {
            if (fmt_ctx->streams[i]->codecpar->codec_type ==
                AVMEDIA_TYPE_AUDIO) {
                audio_stream_index = i;
                break;
            }
        }
        if (audio_stream_index == -1) {
            std::cerr << __FUNCTION__ << " could not find a audio stream." << std::endl;
            return -1;
        }
        std::cout << __FUNCTION__ << ": find audio stream success. audio_stream_index: " << audio_stream_index << std::endl;;
        // Grab the audio stream.
        AVStream *audio_stream = fmt_ctx->streams[audio_stream_index];
        // Look up the decoder from the stream's codec id.
        audio_decoder = (AVCodec *)avcodec_find_decoder(
            fmt_ctx->streams[audio_stream_index]->codecpar->codec_id);
        if (audio_decoder == nullptr) {
            std::cerr << __FUNCTION__ << ": can not find a audio codec." << std::endl;
            return -1;
        }
        std::cout << "audio decoder: " << audio_decoder->name
                  << ", codec id: " << audio_decoder->id
                  << ", codec long name: " << audio_decoder->long_name << std::endl;
        // Allocate the decoder context.
        audio_dec_ctx =
            (AVCodecContext *)avcodec_alloc_context3(audio_decoder);
        // Copy the stream parameters into the decoder context.
        avcodec_parameters_to_context(audio_dec_ctx, audio_stream->codecpar);
        if (avcodec_open2(audio_dec_ctx, audio_decoder, nullptr) < 0) {
            std::cerr << __FUNCTION__ << " can not open a audio codec." << std::endl;
            return -1;
        }
        std::cout << __FUNCTION__ << ": initialize audio decoder success."
                  << std::endl;
        av_dump_format(fmt_ctx, 0, ":0", 0);
        show_audio_input_ctx(audio_stream);
    } else {
        std::cerr << "av find input format error" << std::endl;
        return -1;
    }
    // init for audio encoder
    ret = avformat_alloc_output_context2(&out_fmt_ctx, nullptr, nullptr, aac_output_filename.c_str());
    out_fmt = out_fmt_ctx->oformat;
    if (ret >= 0) {
        /* find the AAC encoder */
        //audio_encoder = (AVCodec *) avcodec_find_encoder(out_fmt_ctx->oformat->audio_codec);
        //audio_encoder = (AVCodec *) avcodec_find_encoder(AV_CODEC_ID_AAC);
        audio_encoder = (AVCodec *) avcodec_find_encoder_by_name("libfdk_aac");
        // NOTE(review): audio_encoder is dereferenced here before the null
        // check below, and the second argument reads audio_decoder's long
        // name — presumably audio_encoder->long_name was intended.
        printf("codec name %s, codec long name: %s\n", audio_encoder->name, audio_decoder->long_name);
        if (!audio_encoder) {
            fprintf(stderr, "audio encoder not found\n");
            exit(1);
        }
        out_stream = avformat_new_stream(out_fmt_ctx, audio_encoder);
        if (out_stream == nullptr) {
            fprintf(stderr, "Could not allocate audio stream\n");
            exit(1);
        }
        audio_enc_ctx = avcodec_alloc_context3(audio_encoder);
        if (!audio_enc_ctx) {
            fprintf(stderr, "Could not allocate audio codec context\n");
            exit(1);
        }
        // Let the encoder choose the bit rate itself; setting it manually
        // shortened the audio for reasons not yet understood.
        //m_pAudioEncoderCtx->bit_rate = 64000;
        /* Check that the encoder supports the requested sample format. */
        audio_enc_ctx->sample_fmt = AV_SAMPLE_FMT_S16;
        if (!check_sample_fmt(audio_encoder, audio_enc_ctx->sample_fmt)) {
            fprintf(stderr, "Encoder does not support sample format %s\n",
                    av_get_sample_fmt_name(audio_enc_ctx->sample_fmt));
            exit(1);
        }
        //audio_enc_ctx->sample_rate = select_sample_rate(audio_encoder);
        audio_enc_ctx->sample_rate = 44100;
        //audio_enc_ctx->channel_layout = select_channel_layout(audio_encoder);
        audio_enc_ctx->channel_layout = AV_CH_LAYOUT_MONO;
        audio_enc_ctx->channels = av_get_channel_layout_nb_channels(audio_enc_ctx->channel_layout);
        //audio_enc_ctx->channels = 1;
        ret = avio_open(&out_fmt_ctx->pb, aac_output_filename.c_str(), AVIO_FLAG_WRITE);
        if (ret < 0) {
            fprintf(stderr, "avio_open fail \n");
            exit(1);
        }
        /* open it */
        if (avcodec_open2(audio_enc_ctx, audio_encoder, nullptr) < 0) {
            fprintf(stderr, "Could not open audio codec\n");
            exit(1);
        }
        ret = avcodec_parameters_from_context(out_stream->codecpar, audio_enc_ctx);
        if (ret < 0) {
            printf("avcodec_parameters_from_context fail \n");
        }
        ret = avformat_write_header(out_fmt_ctx, nullptr);
        if (ret < 0) {
            printf("avformat_write_header fail \n");
        }
        std::cout << "------ init audio encoder success. ------" << std::endl;
        printf("dst sample_rate: %7d\n", audio_enc_ctx->sample_rate);
        printf("dst channels: %7d\n", audio_enc_ctx->channels);
        printf("dst sample_fmt: %7d\n", audio_enc_ctx->sample_fmt);
        printf("dst bit_rate: %7lld\n", audio_enc_ctx->bit_rate);
        printf("dst frame size: %7d\n", audio_enc_ctx->frame_size);
        std::cout << "-----------------------------------------" << std::endl;
        // Resampling targets mirror the encoder configuration.
        dst_sample_rate = audio_enc_ctx->sample_rate;
        dst_sample_fmt = audio_enc_ctx->sample_fmt;
        dst_ch_layout = audio_enc_ctx->channel_layout;
        dst_nb_channels = audio_enc_ctx->channels;
    } else {
        fprintf(stderr, "avformat_alloc_output_context2 fail \n");
        return ret;
    }
    FILE *outputFile = fopen("out.pcm", "wb+");
    int iFrameCount = 0;
    int64_t pts = 0;
    AVFrame *frame = av_frame_alloc();
    AVPacket *pkt = av_packet_alloc();
    AVFrame *out_frame = nullptr;
    AVPacket *out_pkt = av_packet_alloc();
    // Capture/resample/encode loop, bounded at 300 decoded frames.
    while (iFrameCount < 300) {
        // Keep pulling captured packets from the device.
        ret = av_read_frame(fmt_ctx, pkt);
        if (ret == 0) {
            if (pkt->stream_index == audio_stream_index) {
                ret = avcodec_send_packet(audio_dec_ctx, pkt);
                while (ret >= 0) {
                    ret = avcodec_receive_frame(audio_dec_ctx, frame);
                    if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) {
                        break;
                    } else if (ret < 0) {
                        std::cerr << "avcodec_receive_frame failed" << std::endl;
                        return -1;
                    }
                    iFrameCount++;
                    //printf("====== audio frame n:%d coded_n:%d \n", iFrameCount++, frame->coded_picture_number);
                    ret = av_frame_make_writable(frame);
                    if (ret < 0) {
                        fprintf(stderr, "av_frame_make_writable failed, error: %s\n", av_err2str(ret));
                        exit(-1);
                    }
                    // start resampling
                    // fwrite(frame->data[0], 1,
                    //        frame->nb_samples *
                    //        av_get_bytes_per_sample(static_cast<AVSampleFormat>(frame->format)),
                    //        outputFile);
                    // Lazily create the resampler once the first decoded
                    // frame reveals the source parameters.
                    if (nullptr == swr_ctx) {
                        /**
                         * Alternatively this can be configured with swr_alloc,
                         * av_opt_set_channel_layout, av_opt_set_int and
                         * av_opt_set_sample_fmt, which is more flexible.
                         */
                        swr_ctx = swr_alloc_set_opts(nullptr, dst_ch_layout, dst_sample_fmt, dst_sample_rate,
                                                     frame->channel_layout, (AVSampleFormat)frame->format,
                                                     frame->sample_rate, 0, nullptr);
                        swr_init(swr_ctx);
                    }
                    // Perform the audio resampling.
                    int src_nb_sample = frame->nb_samples;
                    src_sample_rate = frame->sample_rate;
                    // Keep dst_nb_samples / dst_sample_rate == src_nb_sample / src_sample_rate.
                    max_dst_nb_samples = dst_nb_samples = av_rescale_rnd(src_nb_sample, dst_sample_rate, src_sample_rate, AV_ROUND_UP);
                    // The resampler buffers some samples internally; the
                    // buffered amount could be queried like this:
                    // int64_t delay = swr_get_delay(swr_ctx, frame->sample_rate);
                    // dst_nb_samples = av_rescale_rnd(delay + frame->nb_samples, dst_sample_rate, frame->sample_rate,
                    //                                 AV_ROUND_UP);
                    if (nullptr == out_frame) {
                        out_frame = alloc_audio_frame(dst_sample_fmt, dst_ch_layout, dst_sample_rate, dst_nb_samples);
                        av_frame_make_writable(out_frame);
                    }
                    //
                    // std::cout << "-------- dst audio information. --------" << std::endl;
                    // printf("dst sample_rate: %d\n", audio_dst_sample_rate);
                    // printf("dst channels: %d\n", av_get_channel_layout_nb_channels(audio_dst_channel_layout));
                    // printf("dst sample_fmt: %d\n", audio_dst_sample_fmt);
                    // printf("dst nb_samples: %d\n", dst_nb_samples);
                    // std::cout << "-----------------------------------------" << std::endl;
                    if (dst_nb_samples > max_dst_nb_samples) {
                        // Frame size changed: reallocate the output frame.
                        printf("dst nb samples: %d,max nb samples: %d\n", dst_nb_samples, max_dst_nb_samples);
                        av_frame_free(&out_frame);
                        out_frame = alloc_audio_frame(dst_sample_fmt, dst_ch_layout, dst_sample_rate, dst_nb_samples);
                        av_frame_make_writable(out_frame);
                        max_dst_nb_samples = dst_nb_samples;
                    }
                    // Resample; the return value is the per-channel sample count.
                    //int iConvertSampleCount = swr_convert(swr_ctx, out_frame->data, dst_nb_samples,
                    //                                      const_cast<const uint8_t **>(frame->data), src_nb_sample);
                    static int sum = 0;
                    sum++;
                    ret = swr_convert_frame(swr_ctx, out_frame, frame);
                    if (ret < 0) {
                        printf("swr_convert_frame fail %d", ret);
                        continue;
                    }
                    int iConvertSampleCount = out_frame->nb_samples;
                    if (iConvertSampleCount > 0) {
                        printf("dst nb samples: %d,max nb samples %d, convert samples: %d\n", dst_nb_samples, max_dst_nb_samples, iConvertSampleCount);
                        //printf("src channel layout: %llu, dst channel layout: %llu\n", m_pFrame->channel_layout, audio_dst_channel_layout);
                        //printf("src rate: %d, dst rate: %d\n", src_sample_rate, audio_dst_sample_rate);
                        //printf("src fmt: %d, dst fmt: %d\n", m_pFrame->format, audio_dst_sample_fmt);
                        //printf("iConvertSampleCount: %d\n", iConvertSampleCount);
                        // Derive the channel count from the channel layout.
                        int dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);
                        //printf("dst_nb_channels: %d, m_pOutFrame->channels: %d\n", dst_nb_channels, m_pOutFrame->channels);
                        // Size of one sample of one channel.
                        int iPerSampleSize = av_get_bytes_per_sample(static_cast<AVSampleFormat>(out_frame->format));
                        // Planar (suffix P, e.g. FLTP): each channel lives in
                        // its own data[i] plane.
                        // Packed (no P): channels are interleaved LRLRLR in
                        // data[0].
                        // Determine whether the format is planar
                        // (see samplefmt.c in FFmpeg for details).
                        int planar = av_sample_fmt_is_planar(dst_sample_fmt);
                        // Planar: number of planes equals the channel count.
                        // Packed: a single plane, all channels in data[0].
                        int planes = planar ? dst_nb_channels : 1;
                        //printf("=========== planes: %d\n", planes);
                        // Planar: one block is one sample of one channel.
                        // Packed: one block holds all channels of one sample,
                        // hence * dst_nb_channels.
                        int block_align = av_get_bytes_per_sample(dst_sample_fmt) * (planar ? 1 : dst_nb_channels);
                        int data_size = iConvertSampleCount * block_align;
                        //printf("persample size: %d, block size: %d, data size: %d\n", iPerSampleSize, block_align, data_size);
                        // Multi-channel planar data sits in separate data[]
                        // planes, so write sample-by-sample across planes:
                        // PCM files conventionally use packed (interleaved) layout.
                        for (int i = 0; i < iConvertSampleCount; i++) {
                            for (int ch = 0; ch < planes; ch++) {
                                fwrite(out_frame->data[ch] + block_align * i, 1, block_align, outputFile);
                            }
                        }
                        // printf("out_samples: %d, cov samples %d\n",m_pOutFrame->nb_samples, iConvertSampleCount);
                        //out_frame->nb_samples = iConvertSampleCount;
                        //pts += iConvertSampleCount;
                        //m_iPts += dst_nb_samples;
                        // NOTE(review): pts advances by 1 per frame here; with
                        // a 1/sample_rate time base it should advance by
                        // nb_samples — a likely cause of the sped-up output.
                        out_frame->pts = pts++;
                        // out_frame->pts = av_rescale_q(pts, (AVRational) {
                        //     1, dst_sample_rate
                        // }, audio_enc_ctx->time_base);
                        // printf("dst_nb_samples: %d, convert_count: %d, samples_count: %d, sample_rate: %d, time_base den: %d, pts: %d\n",
                        //        dst_nb_samples, iConvertSampleCount, pts, audio_enc_ctx->sample_rate, audio_enc_ctx->time_base.den, out_frame->pts);
                        // pts += iConvertSampleCount;
                        encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, out_frame);
                        // Drain any samples the resampler buffered internally.
                        while (swr_convert_frame(swr_ctx, out_frame, nullptr) >= 0) {
                            if (out_frame->nb_samples <= 0) {
                                break;
                            }
                            printf("还有余量 %d", out_frame->nb_samples);
                            out_frame->pts = pts++;
                            encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, out_frame);
                        }
                    }
                }
            }
            av_packet_unref(pkt);
        } else {
            // Device data not ready yet: retry the read.
            if (ret == AVERROR(EAGAIN)) {
                continue;
            }
            printf("\n====== av_read_frame failed, ret: %s\n",
                   av_err2str(ret));
            break;
        }
    }
    // Flush the encoder, then finalize the output file.
    encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, nullptr);
    ret = av_write_trailer(out_fmt_ctx);
    if (ret < 0) {
        printf("av_write_trailer fail \n");
    }
    const char *fmt;
    get_format_from_sample_fmt(&fmt, audio_enc_ctx->sample_fmt);
    fprintf(stderr,
            "Resampling succeeded. Play the output file with the command:\n"
            "ffplay -f %s -channel_layout %" PRId64 " -channels %d -ar %d %s\n",
            fmt, audio_enc_ctx->channel_layout,
            audio_enc_ctx->channels, audio_enc_ctx->sample_rate,
            "out.pcm");
    // Close the PCM dump file.
    fclose(outputFile);
    return 0;
}