【FFmpeg】录音并编码aac

c1rew
已于 2022-07-27 08:45:57 修改
阅读量1.2k
点赞数
分类专栏：音视频 FFmpeg 文章标签：音视频
于 2022-05-31 22:57:51 首次发布
本文链接：https://blog.csdn.net/kalenzh/article/details/125074965
版权
音视频同时被 2 个专栏收录
1 篇文章 0 订阅
订阅专栏
FFmpeg
1 篇文章 0 订阅
订阅专栏
硬件平台：mac
FFmpeg版本：5.0.1
来源数据：采样率 48000，单通道，FLTP 格式；已设置重采样参数。
目标编码器：采样率 44100，单通道，S16 格式。
重采样之后写入 pcm 文件是正常的。
但把重采样后的数据编码成 aac 后结果不对：时长变短，播放起来明显是加速的音频。
编码器采样率同样设为 48000 的情况下则正常。
2022.07.27更新：
以上时长变短、播放加速的问题，后面确认原因如下：采样率不同的情况下，重采样后每帧的采样数（nb_samples）与 aac 编码器要求的固定帧大小（frame_size）不一致，所以不能把重采样结果直接送去编码。需要用 FIFO 队列（例如 AVAudioFifo）做中间层：先把重采样后的数据写入队列，再按编码器的帧大小从队列中读取数据进行编码，才不会有问题。代码还没实现，后续弄好再更新。
/**
 * @file record2aac.cpp
 * @brief record audio from mac microphone and encode to aac
 *
 * @copyright Copyright (c) 2022
 */

#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavdevice/avdevice.h>
#include <libavformat/avformat.h>
#include <libavutil/audio_fifo.h>
#include <libavutil/avassert.h>
#include <libavutil/timestamp.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}

static void log_packet(const AVFormatContext *fmt_ctx, const AVPacket *pkt)
{
    AVRational *time_base = &fmt_ctx->streams[pkt->stream_index]->time_base;
    printf("pts:%s pts_time:%s dts:%s dts_time:%s duration:%s duration_time:%s stream_index:%d\n",
           av_ts2str(pkt->pts), av_ts2timestr(pkt->pts, time_base),
           av_ts2str(pkt->dts), av_ts2timestr(pkt->dts, time_base),
           av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, time_base),
           pkt->stream_index);
}

/**
 * Map a packed sample format to the matching raw-audio format name
 * (the kind of name passed to `ffplay -f`), for the host's native endianness.
 *
 * @param fmt         receives the format name, or nullptr when unsupported
 * @param sample_fmt  sample format to look up
 * @return 0 on success, AVERROR(EINVAL) when the format is not in the table
 */
static int get_format_from_sample_fmt(const char **fmt,
                                      enum AVSampleFormat sample_fmt)
{
    struct sample_fmt_entry {
        enum AVSampleFormat sample_fmt;
        const char *fmt_be;
        const char *fmt_le;
    };
    // Only the packed formats listed here have a raw-format name.
    static const struct sample_fmt_entry known_formats[] = {
        { AV_SAMPLE_FMT_U8,  "u8",    "u8"    },
        { AV_SAMPLE_FMT_S16, "s16be", "s16le" },
        { AV_SAMPLE_FMT_S32, "s32be", "s32le" },
        { AV_SAMPLE_FMT_FLT, "f32be", "f32le" },
        { AV_SAMPLE_FMT_DBL, "f64be", "f64le" },
    };
    const size_t count = FF_ARRAY_ELEMS(known_formats);

    *fmt = nullptr;

    for (size_t idx = 0; idx < count; ++idx) {
        if (known_formats[idx].sample_fmt != sample_fmt) {
            continue;
        }

        // AV_NE selects the big- or little-endian name for this build.
        *fmt = AV_NE(known_formats[idx].fmt_be, known_formats[idx].fmt_le);
        return 0;
    }

    fprintf(stderr,
            "Sample format %s not supported as output format\n",
            av_get_sample_fmt_name(sample_fmt));
    return AVERROR(EINVAL);
}


/**
 * Check that a given sample format is supported by the encoder.
 *
 * @param codec       encoder whose advertised sample formats are searched
 * @param sample_fmt  format to look for
 * @return 1 when the format is listed, 0 otherwise. When the codec does not
 *         publish a format list (sample_fmts is NULL) support cannot be
 *         confirmed, so 0 is returned instead of dereferencing a null pointer.
 */
static int check_sample_fmt(const AVCodec *codec, enum AVSampleFormat sample_fmt)
{
    if (codec == nullptr || codec->sample_fmts == nullptr) {
        return 0;
    }

    // The list is terminated by AV_SAMPLE_FMT_NONE.
    for (const enum AVSampleFormat *p = codec->sample_fmts;
         *p != AV_SAMPLE_FMT_NONE; p++) {
        if (*p == sample_fmt) {
            return 1;
        }
    }

    return 0;
}

/**
 * Pick the supported sample rate closest to 44100 Hz.
 *
 * @param codec  encoder whose zero-terminated supported_samplerates list is
 *               scanned
 * @return 44100 when the codec publishes no list; otherwise the listed rate
 *         with the smallest distance to 44100 (the first one wins on a tie);
 *         0 when the list is empty
 */
static int select_sample_rate(const AVCodec *codec)
{
    const int *rate = codec->supported_samplerates;
    int chosen = 0;

    if (rate == nullptr) {
        return 44100;
    }

    for (; *rate != 0; ++rate) {
        const bool first = (chosen == 0);

        if (first || abs(44100 - *rate) < abs(44100 - chosen)) {
            chosen = *rate;
        }
    }

    return chosen;
}

/**
 * Select the layout with the highest channel count.
 *
 * The result is returned as uint64_t, the type of the channel-layout masks
 * in codec->channel_layouts, so that masks using bit 31 or above are not
 * truncated or sign-extended on the way out.
 *
 * @param codec  encoder whose zero-terminated channel_layouts list is scanned
 * @return AV_CH_LAYOUT_STEREO when the codec publishes no list; otherwise the
 *         listed layout with the most channels (the first one wins on a tie);
 *         0 when the list is empty
 */
static uint64_t select_channel_layout(const AVCodec *codec)
{
    uint64_t best_ch_layout = 0;
    int best_nb_channels = 0;

    if (!codec->channel_layouts) {
        return AV_CH_LAYOUT_STEREO;
    }

    for (const uint64_t *p = codec->channel_layouts; *p; p++) {
        const int nb_channels = av_get_channel_layout_nb_channels(*p);

        if (nb_channels > best_nb_channels) {
            best_ch_layout = *p;
            best_nb_channels = nb_channels;
        }
    }

    return best_ch_layout;
}

/**
 * Send one frame to the encoder and write every packet it produces.
 *
 * @param fmt_ctx  output (muxer) context the packets are written to
 * @param enc_ctx  opened encoder context
 * @param stream   output stream the packets belong to
 * @param packet   reusable packet that receives the encoder output
 * @param pframe   frame to encode, or nullptr to flush the encoder
 * @return 0 when the encoder has been drained (it needs more input, or has
 *         reached end of stream after a flush); a negative AVERROR code when
 *         sending the frame or writing a packet fails. An encoding error
 *         terminates the process.
 */
static int encode(AVFormatContext *fmt_ctx, AVCodecContext *enc_ctx, AVStream *stream, AVPacket *packet, AVFrame *pframe)
{
    int ret = avcodec_send_frame(enc_ctx, pframe);

    if (ret < 0) {
        fprintf(stderr, "Error sending the frame to the encoder\n");
        return ret;
    }

    for (;;) {
        ret = avcodec_receive_packet(enc_ctx, packet);

        // EAGAIN: the encoder wants more input. EOF: fully flushed.
        // Both are the normal way out of this loop, not failures.
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            return 0;
        }

        if (ret < 0) {
            fprintf(stderr, "Error encoding frame\n");
            exit(1);
        }

        // Attach the packet to its output stream and express its timestamps
        // in the stream time base, which is what the muxer expects.
        packet->stream_index = stream->index;

        if (enc_ctx->time_base.num > 0 && enc_ctx->time_base.den > 0) {
            av_packet_rescale_ts(packet, enc_ctx->time_base, stream->time_base);
        }

        log_packet(fmt_ctx, packet);

        // av_interleaved_write_frame() takes ownership of the packet contents,
        // so no av_packet_unref() is needed afterwards.
        ret = av_interleaved_write_frame(fmt_ctx, packet);

        if (ret < 0) {
            printf("av_interleaved_write_frame fail err: %s\n", av_err2str(ret));
            return ret;
        }
    }
}

/**
 * Allocate an audio frame and, when a sample count is given, its buffers.
 *
 * Allocation failures are fatal: a message is printed and the process exits.
 *
 * @param sample_fmt      sample format stored in the frame
 * @param channel_layout  channel layout mask stored in the frame
 * @param sample_rate     sample rate stored in the frame
 * @param nb_samples      samples per channel; 0 skips buffer allocation
 * @return the newly allocated frame (never nullptr)
 */
static AVFrame *alloc_audio_frame(enum AVSampleFormat sample_fmt,
                                  uint64_t channel_layout, int sample_rate,
                                  int nb_samples)
{
    AVFrame *audio = av_frame_alloc();

    if (audio == nullptr) {
        fprintf(stderr, "Error allocating an audio frame\n");
        exit(1);
    }

    audio->nb_samples = nb_samples;
    audio->sample_rate = sample_rate;
    audio->channel_layout = channel_layout;
    audio->format = sample_fmt;

    // Without a sample count there is nothing to size the buffers with.
    if (nb_samples == 0) {
        return audio;
    }

    if (av_frame_get_buffer(audio, 0) < 0) {
        fprintf(stderr, "Error allocating an audio buffer\n");
        exit(1);
    }

    return audio;
}

/**
 * Print the audio parameters of an input stream to stdout: channel count,
 * sample rate, sample format name and the byte size of one sample of one
 * channel.
 *
 * @param stream  stream whose codec parameters are printed
 */
void show_audio_input_ctx(AVStream *stream)
{
    const AVCodecParameters *par = stream->codecpar;
    const AVSampleFormat sample_format = (AVSampleFormat)par->format;

    // Number of channels
    std::cout << "channels: " << par->channels << std::endl;
    // Sample rate
    std::cout << "sample rate: " << par->sample_rate << std::endl;
    // Sample format
    std::cout << "sample format: " << av_get_sample_fmt_name(sample_format) << std::endl;
    // Bytes occupied by one sample of a single channel
    std::cout << "bytes per sample: " << av_get_bytes_per_sample(sample_format) << std::endl;
}

int main(int argc, char **argv)
{
    int ret = -1;
    int audio_stream_index = -1;  // microphone input audio stream index
    int src_nb_samples = 0;
    int dst_nb_samples = 0;
    int max_dst_nb_samples = 0;
    // 音频三要素，其中 layout 与 channels 算一个
    AVSampleFormat src_sample_fmt;
    AVSampleFormat dst_sample_fmt;
    int src_sample_rate = 0;
    int dst_sample_rate = 0;
    int64_t src_ch_layout;
    int64_t dst_ch_layout;
    int src_nb_channels = 0;
    int dst_nb_channels = 0;
    uint8_t **src_data = nullptr;
    uint8_t **dst_data = nullptr;
    int src_linesize, dst_linesize;
    const char *dst_filename = NULL;
    FILE *dst_file;
    int dst_bufsize;
    // for input audio
    AVCodec *audio_decoder = nullptr;
    AVCodecContext *audio_dec_ctx = nullptr;
    // for resample
    struct SwrContext *swr_ctx = nullptr;
    // for audio encoder
    AVFormatContext *out_fmt_ctx;
    AVCodec *audio_encoder;
    AVCodecContext *audio_enc_ctx;
    const AVOutputFormat *out_fmt = nullptr;
    AVStream *out_stream;
    std::string aac_output_filename = "out.aac";
    // register device
    avdevice_register_all();
    AVFormatContext *fmt_ctx = avformat_alloc_context();
    // 获取输入格式对象
    AVInputFormat *input_fmt = (AVInputFormat *)av_find_input_format("avfoundation");

    if (input_fmt != nullptr) {
        // 打开设备
        ret = avformat_open_input(&fmt_ctx, ":0", input_fmt,
                                  nullptr);

        if (ret < 0) {
            std::cerr << "avformat open input error: " << av_err2str(ret)
                      << std::endl;
            return -1;
        }

        if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
            std::cerr << __FUNCTION__ << " avformat_find_stream_info failed."
                      << std::endl;
        }

        // 寻找音频流
        for (int i = 0; i < fmt_ctx->nb_streams; i++) {
            if (fmt_ctx->streams[i]->codecpar->codec_type ==
                AVMEDIA_TYPE_AUDIO) {
                audio_stream_index = i;
                break;
            }
        }

        if (audio_stream_index == -1) {
            std::cerr << __FUNCTION__ << " could not find a audio stream." << std::endl;
            return -1;
        }

        std::cout << __FUNCTION__ << ": find audio stream success. audio_stream_index: " << audio_stream_index << std::endl;;
        // 获取 audio stream
        AVStream *audio_stream = fmt_ctx->streams[audio_stream_index];
        // 根据codec id获取codec
        audio_decoder = (AVCodec *)avcodec_find_decoder(
                            fmt_ctx->streams[audio_stream_index]->codecpar->codec_id);

        if (audio_decoder == nullptr) {
            std::cerr << __FUNCTION__ << ": can not find a audio codec." << std::endl;
            return -1;
        }

        std::cout << "audio decoder: " << audio_decoder->name
                  << ", codec id: " << audio_decoder->id
                  << ", codec long name: " << audio_decoder->long_name << std::endl;
        // 初始化解码器上下文
        audio_dec_ctx =
            (AVCodecContext *)avcodec_alloc_context3(audio_decoder);
        // 复制参数
        avcodec_parameters_to_context(audio_dec_ctx, audio_stream->codecpar);

        if (avcodec_open2(audio_dec_ctx, audio_decoder, nullptr) < 0) {
            std::cerr << __FUNCTION__ << " can not open a audio codec." << std::endl;
            return -1;
        }

        std::cout << __FUNCTION__ << ": initialize audio decoder success."
                  << std::endl;
        av_dump_format(fmt_ctx, 0, ":0", 0);
        show_audio_input_ctx(audio_stream);
    } else {
        std::cerr << "av find input format error" << std::endl;
        return -1;
    }

    // init for audio encoder
    ret = avformat_alloc_output_context2(&out_fmt_ctx, nullptr, nullptr, aac_output_filename.c_str());
    out_fmt = out_fmt_ctx->oformat;

    if (ret >= 0) {
        /* find the AAC encoder */
        //audio_encoder = (AVCodec *) avcodec_find_encoder(out_fmt_ctx->oformat->audio_codec);
        //audio_encoder = (AVCodec *) avcodec_find_encoder(AV_CODEC_ID_AAC);
        audio_encoder = (AVCodec *) avcodec_find_encoder_by_name("libfdk_aac");
        printf("codec name %s, codec long name: %s\n", audio_encoder->name, audio_decoder->long_name);

        if (!audio_encoder) {
            fprintf(stderr, "audio encoder not found\n");
            exit(1);
        }

        out_stream = avformat_new_stream(out_fmt_ctx, audio_encoder);

        if (out_stream == nullptr) {
            fprintf(stderr, "Could not allocate audio stream\n");
            exit(1);
        }

        audio_enc_ctx = avcodec_alloc_context3(audio_encoder);

        if (!audio_enc_ctx) {
            fprintf(stderr, "Could not allocate audio codec context\n");
            exit(1);
        }

        // 比特率让编码器自动处置，如果修改了，可能导致音频时间变短，原因暂时未知
        //m_pAudioEncoderCtx->bit_rate = 64000;
        /* 检查编码器是否支持对应的 音频格式 */
        audio_enc_ctx->sample_fmt = AV_SAMPLE_FMT_S16;

        if (!check_sample_fmt(audio_encoder, audio_enc_ctx->sample_fmt)) {
            fprintf(stderr, "Encoder does not support sample format %s\n",
                    av_get_sample_fmt_name(audio_enc_ctx->sample_fmt));
            exit(1);
        }

        //audio_enc_ctx->sample_rate = select_sample_rate(audio_encoder);
        audio_enc_ctx->sample_rate = 44100;
        //audio_enc_ctx->channel_layout = select_channel_layout(audio_encoder);
        audio_enc_ctx->channel_layout = AV_CH_LAYOUT_MONO;
        audio_enc_ctx->channels       = av_get_channel_layout_nb_channels(audio_enc_ctx->channel_layout);
        //audio_enc_ctx->channels = 1;
        ret = avio_open(&out_fmt_ctx->pb, aac_output_filename.c_str(), AVIO_FLAG_WRITE);

        if (ret < 0) {
            fprintf(stderr, "avio_open fail \n");
            exit(1);
        }

        /* open it */
        if (avcodec_open2(audio_enc_ctx, audio_encoder, nullptr) < 0) {
            fprintf(stderr, "Could not open audio codec\n");
            exit(1);
        }

        ret = avcodec_parameters_from_context(out_stream->codecpar, audio_enc_ctx);

        if (ret < 0) {
            printf("avcodec_parameters_from_context fail \n");
        }

        ret = avformat_write_header(out_fmt_ctx, nullptr);

        if (ret < 0) {
            printf("avformat_write_header fail \n");
        }

        std::cout << "------ init audio encoder success. ------" << std::endl;
        printf("dst sample_rate: %7d\n", audio_enc_ctx->sample_rate);
        printf("dst channels: %7d\n",    audio_enc_ctx->channels);
        printf("dst sample_fmt: %7d\n",  audio_enc_ctx->sample_fmt);
        printf("dst bit_rate: %7lld\n",  audio_enc_ctx->bit_rate);
        printf("dst frame size: %7d\n",  audio_enc_ctx->frame_size);
        std::cout << "-----------------------------------------" << std::endl;
        dst_sample_rate = audio_enc_ctx->sample_rate;
        dst_sample_fmt =  audio_enc_ctx->sample_fmt;
        dst_ch_layout =   audio_enc_ctx->channel_layout;
        dst_nb_channels = audio_enc_ctx->channels;
    } else {
        fprintf(stderr, "avformat_alloc_output_context2 fail \n");
        return ret;
    }

    FILE *outputFile = fopen("out.pcm", "wb+");
    int iFrameCount = 0;
    int64_t pts = 0;
    AVFrame *frame = av_frame_alloc();
    AVPacket *pkt = av_packet_alloc();
    AVFrame *out_frame = nullptr;
    AVPacket *out_pkt = av_packet_alloc();

    while (iFrameCount < 300) {
        // 不断采集数据
        ret = av_read_frame(fmt_ctx, pkt);

        if (ret == 0) {
            if (pkt->stream_index == audio_stream_index) {
                ret = avcodec_send_packet(audio_dec_ctx, pkt);

                while (ret >= 0) {
                    ret = avcodec_receive_frame(audio_dec_ctx, frame);

                    if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) {
                        break;
                    } else if (ret < 0) {
                        std::cerr << "avcodec_receive_frame failed" << std::endl;
                        return -1;
                    }

                    iFrameCount++;
                    //printf("====== audio frame n:%d  coded_n:%d  \n", iFrameCount++, frame->coded_picture_number);
                    ret = av_frame_make_writable(frame);

                    if (ret < 0) {
                        fprintf(stderr, "av_frame_make_writable failed, error: %s\n", av_err2str(ret));
                        exit(-1);
                    }

                    // start resampling
                    // fwrite(frame->data[0], 1,
                    //        frame->nb_samples *
                    //        av_get_bytes_per_sample(static_cast<AVSampleFormat>(frame->format)),
                    //        outputFile);

                    if (nullptr == swr_ctx) {
                        /**
                             * 以下可以使用 swr_alloc、av_opt_set_channel_layout、av_opt_set_int、av_opt_set_sample_fmt
                             * 等API设置，更加灵活
                             */
                        swr_ctx = swr_alloc_set_opts(nullptr, dst_ch_layout, dst_sample_fmt, dst_sample_rate,
                                                     frame->channel_layout, (AVSampleFormat)frame->format,
                                                     frame->sample_rate, 0, nullptr);
                        swr_init(swr_ctx);
                    }

                    // 进行音频重采样
                    int src_nb_sample = frame->nb_samples;
                    src_sample_rate = frame->sample_rate;
                    // 为了保持从采样后 dst_nb_samples / dest_sample = src_nb_sample / src_sample_rate
                    max_dst_nb_samples = dst_nb_samples = av_rescale_rnd(src_nb_sample, dst_sample_rate, src_sample_rate, AV_ROUND_UP);
                    // 从采样器中会缓存一部分，获取缓存的长度
                    // int64_t delay = swr_get_delay(swr_ctx, frame->sample_rate);
                    // dst_nb_samples = av_rescale_rnd(delay + frame->nb_samples, dst_sample_rate, frame->sample_rate,
                    //                                 AV_ROUND_UP);

                    if (nullptr == out_frame) {
                        out_frame = alloc_audio_frame(dst_sample_fmt, dst_ch_layout, dst_sample_rate, dst_nb_samples);
                        av_frame_make_writable(out_frame);
                    }

                    //
                    //    std::cout << "-------- dst audio information. --------" << std::endl;
                    //    printf("dst sample_rate: %d\n", audio_dst_sample_rate);
                    //    printf("dst channels: %d\n", av_get_channel_layout_nb_channels(audio_dst_channel_layout));
                    //    printf("dst sample_fmt: %d\n", audio_dst_sample_fmt);
                    //    printf("dst nb_samples: %d\n", dst_nb_samples);
                    //    std::cout << "-----------------------------------------" << std::endl;

                    if (dst_nb_samples > max_dst_nb_samples) {
                        // frame 大小不一致，重新申请内存
                        printf("dst nb samples: %d,max nb samples: %d\n", dst_nb_samples, max_dst_nb_samples);
                        av_frame_free(&out_frame);
                        out_frame = alloc_audio_frame(dst_sample_fmt, dst_ch_layout, dst_sample_rate, dst_nb_samples);
                        av_frame_make_writable(out_frame);
                        max_dst_nb_samples = dst_nb_samples;
                    }

                    // 重采样，返回的是每个通道的重采样数
                    //int iConvertSampleCount = swr_convert(swr_ctx, out_frame->data, dst_nb_samples,
                     //                                     const_cast<const uint8_t **>(frame->data), src_nb_sample);
                    static int sum = 0;
                    sum++;
                    ret = swr_convert_frame(swr_ctx, out_frame, frame);

                    if (ret < 0) {
                        printf("swr_convert_frame fail %d", ret);
                        continue;
                    }

                    int iConvertSampleCount = out_frame->nb_samples;

                    if (iConvertSampleCount > 0) {
                        printf("dst nb samples: %d,max nb samples %d, convert samples: %d\n", dst_nb_samples, max_dst_nb_samples, iConvertSampleCount);
                        //printf("src channel layout: %llu, dst channel layout: %llu\n", m_pFrame->channel_layout, audio_dst_channel_layout);
                        //printf("src rate: %d, dst rate: %d\n", src_sample_rate, audio_dst_sample_rate);
                        //printf("src fmt: %d, dst fmt: %d\n", m_pFrame->format, audio_dst_sample_fmt);
                        //printf("iConvertSampleCount: %d\n", iConvertSampleCount);
                        // 通过channel layout 获取通道数
                        int dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);
                        //printf("dst_nb_channels: %d, m_pOutFrame->channels: %d\n", dst_nb_channels, m_pOutFrame->channels);
                        // 每帧音频数据量的大小
                        int iPerSampleSize = av_get_bytes_per_sample(static_cast<AVSampleFormat>(out_frame->format));
                        // planar 每个声道存储在data[i]中，FLTP
                        // 不带P，packed，LRLRLR，每个声道间隔存储
                        // 判断是否planar格式
                        // 详细参考源码 samplefmt.c
                        int planar = av_sample_fmt_is_planar(dst_sample_fmt);
                        // planar 通道数就是channels
                        // packed 的话，通道数是1，所有通道数据都在data[0]
                        int planes = planar ? dst_nb_channels : 1;
                        //printf("===========  planes: %d\n", planes);
                        // planar 一个数据块大小，就是一个frame的大小
                        // packed 一个数据块大小，其实是多个通道的数据，也就是*dst_nb_channels
                        int block_align = av_get_bytes_per_sample(dst_sample_fmt) * (planar ? 1 : dst_nb_channels);
                        int data_size = iConvertSampleCount * block_align;
                        //printf("persample size: %d, block size: %d, data size： %d\n", iPerSampleSize, block_align, data_size);

                        // 多通道数据存放在不同的data数组中，需要循环写入
                        // pcm文件默认使用packed ，LR间隔存储
                        for (int i = 0; i < iConvertSampleCount; i++) {
                            for (int ch = 0; ch < planes; ch++) {
                                fwrite(out_frame->data[ch] + block_align * i, 1, block_align, outputFile);
                            }
                        }

                        //        printf("out_samples: %d, cov samples %d\n",m_pOutFrame->nb_samples, iConvertSampleCount);
                        //out_frame->nb_samples = iConvertSampleCount;
                        //pts += iConvertSampleCount;
                        //m_iPts += dst_nb_samples;
                        out_frame->pts = pts++;
                        // out_frame->pts = av_rescale_q(pts, (AVRational) {
                        //     1, dst_sample_rate
                        // }, audio_enc_ctx->time_base);
                        // printf("dst_nb_samples: %d, convert_count: %d, samples_count: %d, sample_rate： %d, time_base den: %d, pts: %d\n",
                        //        dst_nb_samples, iConvertSampleCount, pts, audio_enc_ctx->sample_rate, audio_enc_ctx->time_base.den, out_frame->pts);
                        // pts += iConvertSampleCount;
                        encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, out_frame);

                        while (swr_convert_frame(swr_ctx, out_frame, nullptr) >= 0) {
                            if (out_frame->nb_samples <= 0) {
                                break;
                            }

                            printf("还有余量 %d", out_frame->nb_samples);
                            out_frame->pts = pts++;
                            encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, out_frame);
                        }
                    }
                }
            }

            av_packet_unref(pkt);
        } else {
            // 资源还没准备好，重新读取
            if (ret == AVERROR(EAGAIN)) {
                continue;
            }

            printf("\n====== av_read_frame failed, ret: %s\n",
                   av_err2str(ret));
            break;
        }
    }

    encode(out_fmt_ctx, audio_enc_ctx, out_stream, out_pkt, nullptr);
    ret = av_write_trailer(out_fmt_ctx);

    if (ret < 0) {
        printf("av_write_trailer fail \n");
    }

    const char *fmt;
    get_format_from_sample_fmt(&fmt, audio_enc_ctx->sample_fmt);
    fprintf(stderr,
            "Resampling succeeded. Play the output file with the command:\n"
            "ffplay -f %s -channel_layout %" PRId64 " -channels %d -ar %d %s\n",
            fmt, audio_enc_ctx->channel_layout,
            audio_enc_ctx->channels, audio_enc_ctx->sample_rate,
            "out.pcm");
    // 关闭文件
    fclose(outputFile);
    return 0;
}