ffmpeg学习（6）音频解码、音频数据处理

最新推荐文章于 2024-09-05 17:13:45 发布

aworkholic

最新推荐文章于 2024-09-05 17:13:45 发布

阅读量2k

点赞数 1

本文链接：https://blog.csdn.net/wanggao_1990/article/details/115723020

版权

音视频编解码专栏收录该内容

38 篇文章 92 订阅

订阅专栏

ffmpeg项目巨大，本文针对 ffmpeg学习（3）编码、解码的流程介绍中解码部分，实现了一个最简单的音频解码器，之后再对解码后的音频数据进行处理，例如调整位深度/采样率。最后将音频数据保存为pcm文件并使用Audacity播放。先给出基本的流程，如下图。
在这里插入图片描述
流程图中使用的函数，可以通过其名称了解其用处，网上参考较多，也可以直接看api文档。

其中关于结构体AVFrame和AVPacket的介绍参看文章：
ffmpeg学习结构体分析AVFrame
ffmpeg学习结构体分析AVPacket

关于音频采样数据PCM知识的文章
ffmpeg学习音频采样数据PCM

ffmpeg_audio_decoder示例代码

对照流程图实现，注释较多，直接给出代码。关于解码后的音频采样数据处理，放在本文后面说明。

#include <stdio.h>

#ifdef __cplusplus  
extern "C" {
#endif  

#include "libavformat/avformat.h"

#ifdef __cplusplus  
}
#endif 

int main()
{
    // 打开输入
    //const char* input_file = "../files/Titanic.mkv";
    const char* input_file = "../files/Titanic.mp4";
    //const char* input_file = "../files/BladeRunner2049.h264";
    //const char* input_file = "rtmp://58.200.131.2:1935/livetv/cctv1";
    //const char* input_file = "http://ivi.bupt.edu.cn/hls/cctv6.m3u8";

    int ret;
    AVFormatContext* input_fmt_ctx = NULL; // 必须设置NULL

    if((ret = avformat_open_input(&input_fmt_ctx, input_file, NULL, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot open input file\n");
        return ret;
    }

    // 分析流信息
    if((ret = avformat_find_stream_info(input_fmt_ctx, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
        return ret;
    }

    // 打印信息
    av_dump_format(input_fmt_ctx, 0, input_file, 0);

    //---------------------- 解码部分 ----------------------// 
    int audio_stream_index = -1;
    AVCodec *audio_codec;
    AVCodecContext *audio_decoder_ctx;

    // 查找音频流 
    if((ret = av_find_best_stream(input_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &audio_codec, -1)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find an video stream in the input file\n");
        avformat_close_input(&input_fmt_ctx);
        return ret;
    }
    audio_stream_index = ret;

    // 解码器初始化
    AVCodecParameters *codecpar = input_fmt_ctx->streams[audio_stream_index]->codecpar;

    audio_codec = avcodec_find_decoder(codecpar->codec_id);
    if(!audio_codec) {
        av_log(NULL, AV_LOG_ERROR, "Can't find decoder\n");
        return -1;
    }

    audio_decoder_ctx = avcodec_alloc_context3(audio_codec);
    if(!audio_decoder_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not allocate a decoding context\n");
        avformat_close_input(&input_fmt_ctx);
        return AVERROR(ENOMEM);
    }

    // 解码器参数配置
    if((ret = avcodec_parameters_to_context(audio_decoder_ctx, codecpar)) < 0) {
        avformat_close_input(&input_fmt_ctx);
        avcodec_free_context(&audio_decoder_ctx);
        return ret;
    }

    // 打开解码器
    if((ret = avcodec_open2(audio_decoder_ctx, audio_codec, NULL)) < 0) {
        avformat_close_input(&input_fmt_ctx);
        avcodec_free_context(&audio_decoder_ctx);
        return ret;
    }

    // 解码并保存到文件 
    uint32_t frameCnt = 0;

    AVPacket *pkt = av_packet_alloc(); // 分配一个AVPactet对象，用于管理其缓冲区
    AVFrame *frame = av_frame_alloc(); // 分配一个AVFrame对象，用于管理其缓冲区

FILE *fpcm = fopen("out.pcm", "wb");

    while(av_read_frame(input_fmt_ctx, pkt) >= 0) { // 循环从输入获取一帧压缩编码数据，分配pkt缓冲区  

        // 仅处理视频码流
        if(pkt->stream_index != audio_stream_index) 
            continue;

        ret = avcodec_send_packet(audio_decoder_ctx, pkt);  // 送一帧到解码器

        while(ret >= 0) {
            ret = avcodec_receive_frame(audio_decoder_ctx, frame); // 尝试获取解码数据，分配frame缓冲区
            if(ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            else if(ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n");
                goto end;
            }

            // 解码的音频数据处理
            printf("\rSucceed to decode frame %d\n", frameCnt++);

            av_frame_unref(frame);  // 释放frame缓冲区数据
        }      
        av_packet_unref(pkt); // 释放pkt缓冲区数据
}

// 音频不需要Flush ??

end:
    // 关闭输入
    avcodec_free_context(&audio_decoder_ctx);
    avformat_close_input(&input_fmt_ctx);

    av_packet_free(&pkt);
    av_frame_free(&frame);

    fclose(fpcm);

    return 0;
}

解码后的音频采样数据pcm

在保存pcm文件前，先了解一点PCM一些知识。

对于双声道的PCM文件，按照采样时间顺序依次将左、右两个数据依次存入（格式为LRLRLR…）；对于单声道的PCM数据，通常是按照采样时间顺序依次存入（有时候也会以双声道方式存入，只是另一个声道数据全为0）。

经ffmpeg解码的音频帧一般会有多个pcm采样数据（AVFrame->nb_samples），例如MP3为1152，AAC_LC为1024，AAC_HE为2048。根据采样数据的布局和格式，我们可以算出每个帧中采样数据的内存占用，例如双声道AV_CH_LAYOUT_STEREO 布局，采样格式AV_SAMPLE_FMT_S16，一个AAC_LC帧采样数据占用 2*(16/8)*1024 =4096字节。对于采样率为44100Hz，一个音频帧的播放时间为 1024/44100 ≈ 23.22ms。对于1分钟的pcm音频文件，其大小计算公式为 44100*(16/8)*2*60 ≈ 10M。

修改解码音频数据处理部分如下，

	// 解码的音频数据处理
	int in_sample_bytes = av_get_bytes_per_sample(audio_decoder_ctx->sample_fmt);
	
	for(int n = 0; n < frame->nb_samples; n++)
	    for(int c = 0; c < frame->channels; c++)
	        fwrite(frame->data[c] + n*in_sample_bytes, in_sample_bytes, 1, fpcm);
	printf("\rSucceed to decode frame %d\n", frameCnt++);

编译运行程序，输入视频中的音频信息和保存后的文件大小截图如下。

在这里插入图片描述
验证文件大小 48.03*48000*2*(32/8) = 18,443,520 Bytes ≈ 18011.25KB。

使用audacity导入原始数据并配置播放选项，输入视频的音频信息为48000 Hz, stereo, fltp，修改如下图，点击导入后就可以正常播放了。
在这里插入图片描述

采样格式转换

这里先考虑音频格式（声道数、数据类型、采样数据包装格式）的转换，后续再讨论采样频率的转换（重采样）。

例如，源输入音频格式和布局为AV_SAMPLE_FMT_FLTP和AV_CH_LAYOUT_STEREO，音频格式的转换可以手工处理，也可以使用libswresample提供的swr_convert()函数处理，这里先演示手工处理并保存为文件。

1 修改数据类型

采样数据从32位float类型数据转换为无符号8位uchar类型，需要将取值范围转换到[0,255]。

将原来保存pcm的代码做简单处理如下

// Convert each float sample to unsigned 8-bit and write the channels interleaved.
for(int n = 0; n < frame->nb_samples; n++)
    for(int c = 0; c < frame->channels; c++) {
        float vsrc = *(float *)(frame->data[c] + n*in_sample_bytes);
        float scaled = vsrc*128 + 128;
        // Clamp before narrowing: a sample of 1.0 scales to 256, which does not fit
        // in an unsigned char. The negated test also sends NaN to 0.
        if(!(scaled >= 0))
            scaled = 0;
        else if(scaled > 255)
            scaled = 255;
        unsigned char vdst = (unsigned char)scaled;
        fwrite(&vdst, sizeof(unsigned char), 1, fpcm);
    }

保存后的pcm文件应该为原来的1/4大小。

若将float类型输出其他有符号类型，如signed short(-32768~32767)，计算方式为 vdst = vsrc*32768，并需要将结果限制在[-32768, 32767]范围内（vsrc为1.0时结果为32768，超出了取值范围）。

2 修改声道数

从原来的2个通道，保存为1个通道。两种处理方式，仅保留其中一个通道，或者两个通道平均。这里给出平均的情况。

// Down-mix to one channel: every output sample is the mean of the samples that
// share the same index across all channel planes, written as a float.
for(int n = 0; n < frame->nb_samples; n++) {
    float acc = 0;
    int ch = 0;
    while(ch < frame->channels) {
        const float *src = (const float *)(frame->data[ch] + n*in_sample_bytes);
        acc += *src;
        ch++;
    }
    float vdst = acc / frame->channels;
    fwrite(&vdst, sizeof(float), 1, fpcm);
}

3 修改数据包装格式

前面两个示例，保存的方式其实就是将包装格式planar转换为packed。

4 综合改变并使用libswresample库函数

libswresample库函数swr_convert()使用方法具体见博客 ffmpeg学习函数分析swr_convert。

同视频数据转换使用libswscale库方式，音频数据转换结果可以手动分配内存、也可以使用AVFrame来分配缓冲区来保存转换后的结果。

首先确认音频转换参数、分配SwrContext：

// Output parameters
uint64_t out_channel_layout = AV_CH_LAYOUT_STEREO;
//nb_samples: AAC-1024 MP3-1152
int out_nb_samples = audio_decoder_ctx->frame_size;
AVSampleFormat out_sample_fmt =  AV_SAMPLE_FMT_S16P;  //audio_decoder_ctx->sample_fmt;
int out_sample_rate = audio_decoder_ctx->sample_rate; 
int out_channels = av_get_channel_layout_nb_channels(out_channel_layout);

SwrContext *swr_ctx = NULL;
swr_ctx = swr_alloc_set_opts(swr_ctx, out_channel_layout, out_sample_fmt, out_sample_rate,
                             audio_decoder_ctx->channel_layout, //有些音频只有通道数，没有layout,需要使用av_get_default_channel_layout(chs)获取
                             audio_decoder_ctx->sample_fmt,
                             audio_decoder_ctx->sample_rate, 0, NULL);

if(swr_ctx && swr_init(swr_ctx)) {
    av_log(NULL, AV_LOG_ERROR,"swr_alloc_set_opts or swr_init failed.");
    return 0;
}

分配用于保存转换后音频数据的缓冲区域

AVFrame *frame_aud = av_frame_alloc();
frame_aud->channel_layout = out_channel_layout;
frame_aud->format = out_sample_fmt;
//frame_aud->sample_rate = out_sample_rate;
av_samples_alloc(frame_aud->data, frame_aud->linesize, out_channels, out_nb_samples, out_sample_fmt,1);

int out_sample_bytes = av_get_bytes_per_sample(out_sample_fmt);
int out_sample_is_plannar = av_sample_fmt_is_planar(out_sample_fmt);

对解码音频数据进行转换处理

swr_convert(swr_ctx, (uint8_t **)&frame_aud->data[0], frame->nb_samples, (const uint8_t**)&frame->data[0], frame->nb_samples);

// FLTP => S16
int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);   
if(out_sample_is_plannar) {
    for(int n = 0; n < frame->nb_samples; n++)
        for(int c = 0; c < channles; c++)
            fwrite(frame_aud->data[c] + n*out_sample_bytes, out_sample_bytes, 1, fpcm);
}
else {  // packed
    fwrite(frame_aud->data[0], frame->nb_samples * out_sample_bytes * channles, 1, fpcm);
}

对于明确保存为Packed包装格式到文件中，不论是单声道还是多声道，数据是在内存中连续存储的，因此直接手动分配。

// Method 2: manually allocated buffer
uint8_t *out_buf;
// 仅packed包装格式
int out_buffer_size = av_samples_get_buffer_size(NULL, out_channels, out_nb_samples, out_sample_fmt, 1);
out_buf = (uint8_t *)av_malloc(out_buffer_size);
//av_samples_alloc(&out_buf, NULL, out_channels, out_nb_samples, out_sample_fmt, 1);// 效果同上

处理方式

// Convert one decoded frame into out_buf; ret is the number of samples per channel produced.
ret = swr_convert(swr_ctx, &out_buf, out_nb_samples, (const uint8_t **)&frame->data[0], frame->nb_samples);
if(ret == 0) {
    continue; // not enough input yet to produce any output
}
else if(ret < 0) {
    av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
    goto end;
}
// Write only what was produced: ret can be smaller than out_nb_samples,
// and the rest of out_buf would then be stale data.
fwrite(out_buf, (size_t)ret * out_channels * av_get_bytes_per_sample(out_sample_fmt), 1, fpcm);

对于Planar包装格式的数据，也同样可以使用裸指针，如下，但是这一种方式和使用AVFrame无异了。

uint8_t *out_buf[2];
av_samples_alloc(out_buf, NULL, out_channels, out_nb_samples, out_sample_fmt, 1);

注意：以上所有使用指针分配及AVFrame分配的缓冲区都需要进行资源释放处理。

音频数据重采样

提高采样频率，需要在原有的数据中进行插值，这种使用情况通常较少。

频率降低时，相同时间的采样数据会减少，反映在一帧数据中的解码数据将会变少。

例如以上面输入音频为例，采样率48000Hz，每一帧1024个样本，当采样率降低到8000Hz，那么需要原采样数据上每间隔6个保留一个，那么1024个采样数据会等间隔采样后保留1024*(8000/48000) = 1024/6，取整为170个样本。

手动处理

注意，间隔6个样本保存一次

// Keep one sample out of every six (48000 Hz -> 8000 Hz) and store it as signed 8-bit.
for(int n = 0; n < frame->nb_samples; n+=6)
  for(int c = 0; c < frame->channels; c++) {
      float vsrc = *(float *)(frame->data[c] + n*in_sample_bytes);
      float scaled = vsrc*128;
      // Clamp before narrowing: 1.0 scales to 128, which does not fit in a signed byte.
      // The negated test also sends NaN to the lower bound.
      if(!(scaled >= -128))
          scaled = -128;
      else if(scaled > 127)
          scaled = 127;
      // signed char rather than plain char, whose signedness is implementation-defined
      signed char vdst = (signed char)scaled;
      fwrite(&vdst, sizeof(vdst), 1, fpcm);
  }

保存之后播放，无任何问题。

使用libswresample库处理

定义相关参数

    // Output parameters
    uint64_t out_channel_layout = AV_CH_LAYOUT_STEREO;
    //nb_samples: AAC-1024 MP3-1152
    int out_nb_samples = audio_decoder_ctx->frame_size; // 1024
	AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_S16;
	int out_sample_rate = audio_decoder_ctx->sample_rate;
	int out_channels = av_get_channel_layout_nb_channels(out_channel_layout);
	
	// 采样率发生变化，实际的输出音频采样数需要进行调整， round_up(1024 * (8000/48000)) = 171
	out_sample_rate /= 6; // 48000 -> 8000 
	out_nb_samples = av_rescale_rnd(audio_decoder_ctx->frame_size, out_sample_rate, audio_decoder_ctx->sample_rate, AV_ROUND_UP);
	
	SwrContext *swr_ctx = NULL;
	swr_ctx = swr_alloc_set_opts(swr_ctx, out_channel_layout, out_sample_fmt, out_sample_rate,
	                             audio_decoder_ctx->channel_layout, //有些音频只有通道数，没有layout,需要使用av_get_default_channel_layout(chs)获取
	                             audio_decoder_ctx->sample_fmt,
	                             audio_decoder_ctx->sample_rate, 0, NULL);
	
	if(swr_ctx && swr_init(swr_ctx)) {
	    av_log(NULL, AV_LOG_ERROR,"swr_alloc_set_opts or swr_init failed.");
	    return 0;
	}

packed格式，采用手动分配缓冲

uint8_t *out_buf;
// 仅packed包装格式
int out_buffer_size = av_samples_get_buffer_size(NULL, out_channels, out_nb_samples, out_sample_fmt, 1);
out_buf = (uint8_t *)av_malloc(out_buffer_size);

转换处理，并保存

// Resample one decoded frame into out_buf; ret receives swr_convert's result.
ret = swr_convert(swr_ctx, &out_buf, out_nb_samples, (const uint8_t **)&frame->data[0], frame->nb_samples);
if(ret < 0) {
    av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
    goto end;
}
if(ret == 0)
    continue; // fewer decoded samples than needed to fill out_nb_samples

// NOTE: the whole buffer (out_buffer_size bytes) is written regardless of the value of ret.
fwrite(out_buf, out_buffer_size, 1, fpcm);

保存后选择对应的音频参数播放，发现主体声音正常，但是每隔一段时间就会出现一个噪音。对于1024个样本下采样6次，在进行第171次时对应原采样数据的第171*6 = 1026个，是不存在的。因此out_nb_samples计算调用函数av_rescale_rnd的最后一个参数应该改为向下取整AV_ROUND_DOWN，取值170，再次处理保存播放就一切正常了。

使用AVFrame分配缓冲区，支持单声道或多声道，相对较简单。

AVFrame *frame_aud = av_frame_alloc();
frame_aud->channel_layout = out_channel_layout;
frame_aud->format = out_sample_fmt;
frame_aud->sample_rate = out_sample_rate;
frame_aud->nb_samples = out_nb_samples;
av_samples_alloc(frame_aud->data, frame_aud->linesize, out_channels, out_nb_samples, out_sample_fmt,1);

转换保存部分如下

// ret is presumably the result of the preceding swr_convert call (not part of this snippet).
if(ret < 0) {
    av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
    goto end;
}
if(ret > 0) {
    int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);
    if(!out_sample_is_plannar) {
        // packed: linesize[0] is the byte size of the single interleaved plane
        fwrite(frame_aud->data[0], frame_aud->linesize[0], 1, fpcm);
    }
    else {
        // planar: interleave the per-channel planes sample by sample
        for(int n = 0; n < frame_aud->nb_samples; n++) {
            const int offset = n*out_sample_bytes;
            for(int c = 0; c < channles; c++)
                fwrite(frame_aud->data[c] + offset, out_sample_bytes, 1, fpcm);
        }
    }
}

关于swr_convert函数的返回值的三种情况，当ret<0为出错，ret==0说明当前帧输入样本数不足以转码，ret>0说明转码成功，并且ret==out_count==out_nb_samples（后续有特殊情况会说明）。

目前测试，当ret >= 0情况下保存的文件比ret > 0 时保存的文件大680字节，播放仍然是正常的。这种情况会出现在第一帧样本不足in_nb_samples，返回值ret == 0，虽然保存了out_buf数据，但是由于是第一次使用初始化全为0且未被swr_convert修改，所以会多出out_nb_samples个全为零的采样数据。下面给出几个截图
在这里插入图片描述
从波形图中可以看出，确实是多了一些采样数据全零，时间大约为22ms。多出的680个字节，正好是170个采样数据的字节数170*2*(16/8) = 680，这170个采样数据占用的时间为170/8000=21.25ms。

音频数据缓冲问题

紧接着上面截图，注意视频实际的总时长是48.03s，而音频数据仅有47.833s，相差0.2s?想到前面文章中视频解码时需要Flush decoder，在解码循环结束后的添加代码测试

// Flush decoder
printf("Flush decoder....\n");
while(1) {
    ret = avcodec_send_packet(audio_decoder_ctx, NULL);  // 发送空包到解码器,一定返回0
    if(ret < 0)
        break;
    while(ret > 0) {
        ret = avcodec_receive_frame(audio_decoder_ctx, frame);
        if(ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            break;
        }
        else if(ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n");
            goto end;
        }
        printf("Succeed to decode frame %d\n", frameCnt++);
        av_frame_unref(frame);  // 释放frame缓冲区数据
    }
}

重新运行，发现解码器中并没有缓冲数据。通常一帧解码数据中，会有很多个采样数据。
在这里插入图片描述
注意到swr_convert的说明，可能其中缓冲了部分数据，根据要求继续增加如下测试代码

    // Drain the samples still buffered inside the resampler: calling swr_convert
    // with no input returns buffered output until nothing is left.
    printf("Flush samples \n");
    do {
        ret = swr_convert(swr_ctx, (uint8_t **)&frame_aud->data[0], frame_aud->nb_samples,
                                   NULL,0);
        if(ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
            goto end;
        }
        if(ret > 0) {
            // NOTE: this first version writes a full buffer (frame_aud->nb_samples samples)
            // even when ret is smaller; the article examines the resulting size difference next.
            int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);
            if(out_sample_is_plannar) {
                for(int n = 0; n < frame_aud->nb_samples; n++)
                    for(int c = 0; c < channles; c++)
                        fwrite(frame_aud->data[c] + n*out_sample_bytes, out_sample_bytes, 1, fpcm);
            }
            else {  // packed
                //fwrite(frame_aud->data[0], frame_aud->nb_samples * out_sample_bytes * channles, 1, fpcm);
                fwrite(frame_aud->data[0], frame_aud->linesize[0], 1, fpcm); // equivalent to the line above
            }
        }

        printf("Succeed to decode frame %d  samples %d \n", frameCnt++, ret);

    } while(ret > 0); // stop once nothing more is returned

控制台输出和pcm文件信息如下
在这里插入图片描述
相对于未缓冲sample数据时，现在音频采样数据pcm的播放时长为48.02，增加了48.02-47.833=0.187s，与视频时长相差0.01s。增加保存170*8+165=1525个样本，时长大约为1525/8000 = 0.190s，字节数为1525*2*(16/8) = 6100。相比未缓冲时保存文件大小增加1536800-1530680=6120字节，相差20个字节。由于代码中每次均按照170个采样数据字节保存，相差5个采样数据的大小，(170-165)*2*(16/8)=20。

根据实际输出缓冲sample数据，调整保存文件部分的代码，如下

// Drain the samples still buffered inside the resampler and write exactly
// as many samples as each call returns.
printf("Flush samples \n");
do {
    // Manual-buffer variant: swr_convert(swr_ctx, &out_buf, out_nb_samples, NULL, 0),
    // then write ret * out_channels * out_sample_bytes bytes from out_buf.

    ret = swr_convert(swr_ctx, (uint8_t **)&frame_aud->data[0], frame_aud->nb_samples,
                               NULL,0);
    if(ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
        goto end;
    }
    if(ret > 0) {
        int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);
        if(out_sample_is_plannar) {
            // ret (not frame_aud->nb_samples) bounds the loop: the last call returns fewer samples
            for(int n = 0; n < ret; n++)
                for(int c = 0; c < channles; c++)
                    fwrite(frame_aud->data[c] + n*out_sample_bytes, out_sample_bytes, 1, fpcm);
        }
        else {  // packed
            fwrite(frame_aud->data[0], ret* out_sample_bytes * channles, 1, fpcm);
        }
    }

    printf("Succeed to flush samples %d \n", ret);

} while(ret > 0); // stop once nothing more is returned

这里swr_convert的返回值在ret>0时，在最后一次调用不等于out_nb_samples。

ffmpeg_audio_decoder完整代码

#include <iostream>

#ifdef __cplusplus  
extern "C" {
#endif  

#include "libavformat/avformat.h"
//#include "libavutil/imgutils.h"
//#include "libswscale/swscale.h"

#include "libswresample/swresample.h"

#ifdef __cplusplus  
}
#endif 


int main()
{
    // 打开输入
    //const char* input_file = "../files/Titanic.mkv";
    const char* input_file = "../files/Titanic.mp4";
    //const char* input_file = "../files/BladeRunner2049.h264";
    //const char* input_file = "rtmp://58.200.131.2:1935/livetv/cctv1";
    //const char* input_file = "http://ivi.bupt.edu.cn/hls/cctv6.m3u8";

    int ret;
    AVFormatContext* input_fmt_ctx = NULL; // 必须设置NULL

    if((ret = avformat_open_input(&input_fmt_ctx, input_file, NULL, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot open input file\n");
        return ret;
    }

    // 分析流信息
    if((ret = avformat_find_stream_info(input_fmt_ctx, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
        return ret;
    }

    // 打印信息
    av_dump_format(input_fmt_ctx, 0, input_file, 0);

    //---------------------- 解码部分 ----------------------// 
    int audio_stream_index = -1;
    AVCodec *audio_codec;
    AVCodecContext *audio_decoder_ctx;

    // 查找音频流 
    if((ret = av_find_best_stream(input_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &audio_codec, -1)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find an video stream in the input file\n");
        avformat_close_input(&input_fmt_ctx);
        return ret;
    }
    audio_stream_index = ret;


    // 解码器初始化
    AVCodecParameters *codecpar = input_fmt_ctx->streams[audio_stream_index]->codecpar;

    audio_codec = avcodec_find_decoder(codecpar->codec_id);
    if(!audio_codec) {
        av_log(NULL, AV_LOG_ERROR, "Can't find decoder\n");
        return -1;
    }


    audio_decoder_ctx = avcodec_alloc_context3(audio_codec);
    if(!audio_decoder_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not allocate a decoding context\n");
        avformat_close_input(&input_fmt_ctx);
        return AVERROR(ENOMEM);
    }

    // 解码器参数配置
    if((ret = avcodec_parameters_to_context(audio_decoder_ctx, codecpar)) < 0) {
        avformat_close_input(&input_fmt_ctx);
        avcodec_free_context(&audio_decoder_ctx);
        return ret;
    }


    // 打开解码器
    if((ret = avcodec_open2(audio_decoder_ctx, audio_codec, NULL)) < 0) {
        avformat_close_input(&input_fmt_ctx);
        avcodec_free_context(&audio_decoder_ctx);
        return ret;
    }

    // 解码并保存到文件 
    uint32_t frameCnt = 0;

    AVPacket *pkt = av_packet_alloc(); // 分配一个AVPactet对象，用于管理其缓冲区
    AVFrame *frame = av_frame_alloc(); // 分配一个AVFrame对象，用于管理其缓冲区

    FILE *fpcm = fopen("out.pcm", "wb");



     输出
    uint64_t out_channel_layout = AV_CH_LAYOUT_STEREO;
    //nb_samples: AAC-1024 MP3-1152
    int out_nb_samples = audio_decoder_ctx->frame_size;
    AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_S16P;  //audio_decoder_ctx->sample_fmt;
    int out_sample_rate = audio_decoder_ctx->sample_rate;
    int out_channels = av_get_channel_layout_nb_channels(out_channel_layout);

    //out_sample_rate /= 6; // 48000 -> 8000
    out_sample_rate = 44100;
    out_nb_samples = av_rescale_rnd(audio_decoder_ctx->frame_size, out_sample_rate, audio_decoder_ctx->sample_rate, AV_ROUND_UP);


    SwrContext *swr_ctx = NULL;
    swr_ctx = swr_alloc_set_opts(swr_ctx, out_channel_layout, out_sample_fmt, out_sample_rate,
                                 audio_decoder_ctx->channel_layout, //有些音频只有通道数，没有layout,需要使用av_get_default_channel_layout(chs)获取
                                 audio_decoder_ctx->sample_fmt,
                                 audio_decoder_ctx->sample_rate, 0, NULL);

    if(swr_ctx && swr_init(swr_ctx)) {
        av_log(NULL, AV_LOG_ERROR, "swr_alloc_set_opts or swr_init failed.");
        return 0;
    }

     方式3   单或多声道
    AVFrame *frame_aud = av_frame_alloc();
    frame_aud->channel_layout = out_channel_layout;
    frame_aud->format = out_sample_fmt;
    frame_aud->sample_rate = out_sample_rate;
    frame_aud->nb_samples = out_nb_samples;
    av_samples_alloc(frame_aud->data, frame_aud->linesize, out_channels, out_nb_samples, out_sample_fmt,1);


    int out_sample_bytes = av_get_bytes_per_sample(out_sample_fmt);
    int out_sample_is_plannar = av_sample_fmt_is_planar(out_sample_fmt);

    int in_sample_bytes = av_get_bytes_per_sample(audio_decoder_ctx->sample_fmt);
    int isPlanar = av_sample_fmt_is_planar(audio_decoder_ctx->sample_fmt);



    while(av_read_frame(input_fmt_ctx, pkt) >= 0) { // 循环从输入获取一帧压缩编码数据，分配pkt缓冲区  

        // 仅处理视频码流
        if(pkt->stream_index != audio_stream_index)
            continue;

        ret = avcodec_send_packet(audio_decoder_ctx, pkt);  // 送一帧到解码器

        while(ret >= 0) {
            ret = avcodec_receive_frame(audio_decoder_ctx, frame); // 尝试获取解码数据，分配frame缓冲区
            if(ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            else if(ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n");
                goto end;
            }

            // 解码的音频数据处理
             方式3

            int dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx, frame->sample_rate) + frame->nb_samples, 
                                                frame_aud->sample_rate, frame->sample_rate, AV_ROUND_UP);

            if(dst_nb_samples > frame_aud->nb_samples) {
                av_frame_unref(frame_aud);

                frame_aud->channel_layout = out_channel_layout;
                frame_aud->format = out_sample_fmt;
                frame_aud->sample_rate = out_sample_rate;
                frame_aud->nb_samples = dst_nb_samples;
                av_samples_alloc(frame_aud->data, frame_aud->linesize, 
                                 out_channels, dst_nb_samples, out_sample_fmt, 1);

                if(ret < 0)
                    break;
            }

            ret = swr_convert(swr_ctx, (uint8_t **)&frame_aud->data[0], frame_aud->nb_samples, 
                                 (const uint8_t**)&frame->data[0], frame->nb_samples);

            if(ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "swr_convert error\n");
                goto end;
            }
            else if(ret > 0) {
                int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);
                if(out_sample_is_plannar) {
                    for(int n = 0; n < ret /*frame_aud->nb_samples*/; n++)
                        for(int c = 0; c < channles; c++)
                            fwrite(frame_aud->data[c] + n*out_sample_bytes, out_sample_bytes, 1, fpcm);
                }
                else {  // packed
                    //fwrite(frame_aud->data[0], frame_aud->nb_samples * out_sample_bytes * channles, 1, fpcm);
                    //fwrite(frame_aud->data[0], frame_aud->linesize[0], 1, fpcm); // 与上等效
                    fwrite(frame_aud->data[0], ret* out_sample_bytes * channles, 1, fpcm); 
                }
            }

            printf("Succeed to decode frame %d samples %d\n", frameCnt++, frame->nb_samples);
           // printf("Succeed to decode frame %d  samples %d \n", frameCnt++, frame->nb_samples);

            av_frame_unref(frame);  // 释放frame缓冲区数据
        }

        av_packet_unref(pkt); // 释放pkt缓冲区数据
    }

    // Flush decoder
    printf("Flush decoder....\n");
    while(1) {
        ret = avcodec_send_packet(audio_decoder_ctx, NULL);  // 发送空包到解码器,一定返回0
        if(ret < 0)
            break;
        while(ret > 0) {
            ret = avcodec_receive_frame(audio_decoder_ctx, frame);
            if(ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            else if(ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n");
                goto end;
            }
            printf("Succeed to decode frame %d\n", frameCnt++);
            
            //….省略
            
            av_frame_unref(frame);  // 释放frame缓冲区数据
        }
    }

    printf("Flush samples \n", frameCnt++);
    do {
        //ret = swr_convert(swr_ctx, &out_buf, out_nb_samples, NULL, 0);
        //if(ret > 0)
        //    fwrite(out_buf, ret*out_sample_bytes, 1, fpcm);

        ret = swr_convert(swr_ctx, (uint8_t **)&frame_aud->data[0], frame_aud->nb_samples,
                                   NULL,0);          
        if(ret > 0) {
            int channles = av_get_channel_layout_nb_channels(frame_aud->channel_layout);
            if(out_sample_is_plannar) {
                //for(int n = 0; n < frame_aud->nb_samples; n++)
                for(int n = 0; n < ret; n++)
                    for(int c = 0; c < channles; c++)
                        fwrite(frame_aud->data[c] + n*out_sample_bytes, out_sample_bytes, 1, fpcm);
            }
            else {  // packed
                //fwrite(frame_aud->data[0], frame_aud->nb_samples * out_sample_bytes * channles, 1, fpcm);
                //fwrite(frame_aud->data[0], frame_aud->linesize[0], 1, fpcm); // 与上等效
                fwrite(frame_aud->data[0], ret* out_sample_bytes * channles, 1, fpcm); 
            }
        }

        printf("Succeed to flush samples %d \n", ret);

    } while(ret);

end:
    // 关闭输入
    avcodec_free_context(&audio_decoder_ctx);
    avformat_close_input(&input_fmt_ctx);

    av_packet_free(&pkt);
    av_frame_free(&frame);

    swr_free(&swr_ctx);
    av_frame_free(&frame_aud);

    fclose(fpcm);
    return 0;
}

当前问题：

1、当视频封装是mkv时，audio_decoder_ctx->frame_size 的值为零？？？？

2、由于swr_convert调用有时间间隔，输入samples可能会超过in_count，需要动态调整接收的缓冲区和out_nb_sample…….，若不这样处理，会出现上述需要缓冲很多次的情况，一旦解码时间过长，会导致音频数据持续堆积（内存占用持续增加）。官方resampling_audio.c代码中动态提高out_nb_sample值在一定程度解决了该问题。
保存为裸流无问题，但是在封装文件、推流、混流暂不清楚是否有影响（实测：直接写出实际转换的采样数据个数即可）。参考。。。。。。链接 ffmpeg学习（12）音视频转码（1）使用sws、swr。。。。。