音视频开发26 FFmpeg 音频重采样实战化思考前提 - 时间问题整理，avcodec_send_packet源码分析，avcodec_receive_frame源码分析

hunandede

已于 2024-06-15 11:02:02 修改

阅读量372

点赞数 4

文章标签：音视频 ffmpeg

于 2024-06-11 23:21:45 首次发布

本文链接：https://blog.csdn.net/hunandede/article/details/139611038

版权

time_base 、pts、dts、duration

time_base：时间基，所谓时间基表示的就是每个刻度是多少秒，例如
- 如果把1秒分为25等份，你可以理解就是一把尺，那么每一格表示的就是1/25秒。此时的time_base={1，25} ，
- 如果你是把1秒分成90000份，每一个刻度就是1/90000秒，此时的time_base={1，90000}。
- 在ffmpeg中。av_q2d(time_base)=每个刻度是多少秒
PTS：Presentation Time Stamp。PTS主要用于度量解码后的视频帧什么时候被显示出来，即显示时间戳，某一帧视频什么时候开始显示
- pts的值就是占多少个时间刻度（占多少个格子）。它的单位不是秒，而是时间刻度。
DTS：Decode Time Stamp。DTS主要是标识读入内存中的帧数据流在什么时候开始送入解码器中进行解码，即解码时间戳
duration：某一帧视频显示持续时间，duration和pts单位一样，duration表示当前帧的持续时间占多少格。

1.代码中的不同结构体中的 time_base，pts，dts，duration的含义到底是啥？

在学习这块的时候，发现AVPacket，AVFrame，AVStream中都有time_base，duration，AVPacket，AVFrame中都有pts 和 dts，如果搞不清这些值的真正含义，则有可能在计算的时候出现bug，且不知道怎么fix。

思路：弄一个mp4文件或者mp3文件，解复用，然后解码，中间debug看AVPacket，AVFrame，AVStream中间的这些值。当前测试用了一个mp3文件，主要是先测试一个纯音频。再测试一个h264的，最后再搞一个音视频结合的mp4.

我们就按照这个思路来学习，。先搞个mp3测试一下

1.1 Debug代码：

int DDUtils::MP3toH264(MP3Class & mp3instance, H264class & h264instance){
    //测试将mp3变成pcm数据。
    char *outfilename = "D:/ffplayresource/MP3TO.pcm";
    FILE *outfile = fopen(outfilename, "wb");

    cout<<"func MP3toH264 call start "<<endl;
    int ret =0;

    //解码相关，解码的很多参数是从 mp3instance文件中获得的，因此开始的时候，不能直接设置mp3instance的相关值。
    AVFormatContext* avFormatInputFileContext = nullptr;
    AVStream* mp3avstrem = nullptr;
    const AVCodec* mp3decoder = nullptr ;
    AVCodecContext* mp3decodercontext = nullptr;
    AVPacket* mp3avpacket = nullptr;
    AVFrame* mp3avframe = nullptr;


    //编码相关,我们编码是要变成aac，这是确定的，因此很多参数都是可以直接设定。
    //    AVFormatContext* avFormatOutputFileContext = nullptr;
    //    const AVOutputFormat *avOutputFormat = av_guess_format(nullptr, h264instance.filename.c_str(), nullptr);


    //音频重采样相关
    SwrContext * swrcontext = nullptr;


    //    AVAudioFifo 相关,我们的流程是这样的，将 源文件最终的pcm 通过 重采样成 目标文件的pcm，然后将本应该存到目标文件的pcm，存储到avaduiofifo
    AVAudioFifo *avaduiofifo = nullptr;


    if(mp3instance.filename == ""){

    }
    //第一步 负责申请一个AVFormatContext 结构的内存,并进行简单初始化
    avFormatInputFileContext = avformat_alloc_context();
    if(!avFormatInputFileContext){
        ret = -1;
        ERROR_BUF(ret);
        cout<<"error: avformat_alloc_context error"<<endl;
        goto ddutilsend;
    }


    //第二步：打开媒体文件并获取媒体文件信息的函数
    ret = avformat_open_input(&avFormatInputFileContext, (const char *)mp3instance.filename.c_str(), nullptr, nullptr);
    if(ret != 0){
        cout<<"error: avformat_open_input error"<<endl;
        ERROR_BUF(ret)
                goto ddutilsend;
    }

    //第三步 avformat_find_stream_info()：获取音视频文件信息,
    //avformat_find_stream_info()函数是用于获取媒体文件中每个音视频流的详细信息的函数，包括解码器类型、采样率、声道数、码率、关键帧等信息

    ret = avformat_find_stream_info(avFormatInputFileContext,nullptr);
    if(ret < 0){
        cout<<"error: avformat_find_stream_info error"<<endl;
        ERROR_BUF(ret)
                goto ddutilsend;
    }

    //第四步，找到最合适的audio 流
    mp3instance.best_audio_index = av_find_best_stream(avFormatInputFileContext,AVMEDIA_TYPE_AUDIO,-1,-1,nullptr,0);

    if(mp3instance.best_audio_index <0 ){
        ret = mp3instance.best_audio_index;
        cout<<"error: av_find_best_stream find audio error"<<endl;
        ERROR_BUF(ret)
                goto ddutilsend;
    }

    av_dump_format(avFormatInputFileContext, 0, mp3instance.filename.c_str(), 0);

    cout<<"func MP3toH264 call 111111 mp3instance.best_audio_index = "<< mp3instance.best_audio_index << endl;



    //第五步：找到解码器。到这里我们已经对于mp3的流进行了解封装，那么下来就应该对mp3进行解码了，解码是为了得到 pcm流，然后转换
    //5.1 那么第一个问题就是，我们怎么知道这个mp3文件用什么解码合适呢？这里就要用到 AVStream->codecpar->codec_id了。
    mp3avstrem = avFormatInputFileContext->streams[mp3instance.best_audio_index];
    cout<<"111"<<endl;
    printfAVStream(mp3avstrem);
    cout<<"222"<<endl;
    cout<<" mp3avstrem->codecpar->codec_id = " << mp3avstrem->codecpar->codec_id <<endl;
    
    mp3decoder  = avcodec_find_decoder(mp3avstrem->codecpar->codec_id);
    if(!mp3decoder){
        ret = -3;
        cout<<"error: avcodec_find_decoder find audio error"<<endl;
        goto ddutilsend;
    }

    //第六步：关联解码器上下文 avcodec_alloc_context3(): 分配解码器上下文
    mp3decodercontext = avcodec_alloc_context3(mp3decoder);
    if(!mp3decodercontext){
        ret = -4;
        cout<<"error: avcodec_alloc_context3 mp3decoder  error"<<endl;
        goto ddutilsend;
    }

    //第七步，给给解码器上下文添加参数, avcodec_parameters_to_context():

    ret = avcodec_parameters_to_context(mp3decodercontext, mp3avstrem->codecpar);

    if(ret < 0){
        cout<<"error: avcodec_parameters_to_context   error"<<endl;
        ERROR_BUF(ret)
                goto ddutilsend;
    }

    //第八步：打开编解码器 avcodec_open2()：

    ret = avcodec_open2(mp3decodercontext,mp3decoder,nullptr);

    if(ret < 0){
        cout<<"error: avcodec_open2   error"<<endl;
        ERROR_BUF(ret)
                goto ddutilsend;
    }

    //第十步，音频重采样相关
    swrcontext = swr_alloc();



    //第九步：到这里，就可以开始读取数据了,读取的数据要存储在avpacket中，因此先要调用av_packet_alloc创建一个avpacket
    //9.1 创建avpacket 和 avframe
    mp3avpacket =  av_packet_alloc();
    mp3avframe = av_frame_alloc();

    if(!mp3avpacket){
        ret = -5;
        cout<<"error: av_packet_alloc error"<<endl;
        goto ddutilsend;
    }
    //9.2 从avFormatContext 对应的file 中读取数据，ffmpeg中的av_read_frame()的作用是读取码流中的音频若干帧或者视频一帧
    // @return 0 if OK, < 0 on error or end of file
    while(av_read_frame(avFormatInputFileContext, mp3avpacket) >=0){


        //额外的说明，这时候mp3avpacket中存储的是压缩过的mp3格式的数据，如果我们是从mp4文件读取的avpacket，这时候mp3avpacket中应该存储的aac文件，
        //那么就可以直接存储这个aac文件，如果想不起来，可以回顾0702 从mp4文件中抽取 aac,这是因为从mp4中获取的是aac，和要存储的aac是一样的，加上aac需要的头部 adts_header，就可以生成需要的aac
        //可惜我们这里的目的是：

        //9.2.1那么这时候 mp3avpacket 中就有数据了，那么下来需要将这些 avpacket 的数据发送到解码器,发送完成后，需要调用av_packet_ubref 将 refcount减去1
        // debug 位置1
        avcodec_send_packet(mp3decodercontext,mp3avpacket);
        av_packet_unref(mp3avpacket);


        //9.2.2 这时候就可以从解码器中拿数据了,注意的是，一次send可能对应多次receive，因此这里也要用一个循环

        for(;;)
        {
            //从线程中获取解码接口,一次send可能对应多次receive,读取的数据这时候就已经到了mp3avframe中了，返回值是0表示成功。
            ret = avcodec_receive_frame(mp3decodercontext, mp3avframe);
            if (ret != 0) break;
            // debug 位置2
            static int s_print_format = 0;
            static int data_size =0;
            static int isplanar = 0;
            //根据自己在该方法前面加的log打印，就会明白，这里为什么要有一个 static int s_print_format，因为这个方法会不停的走进来，打印的太多了
            if(s_print_format == 0)
            {
                data_size = av_get_bytes_per_sample((enum AVSampleFormat)mp3avframe->format);
                isplanar = av_sample_fmt_is_planar((enum AVSampleFormat)mp3avframe->format);
                s_print_format = 1;
                print_sample_format(mp3avframe);
            }

            if(mydebug){
                cout << "recv frame " << mp3avframe->format
                     << " recv lizesize " << mp3avframe->linesize[0]
                     << " recv channel " <<mp3avframe->ch_layout.nb_channels
                     << " recv number of audio samples (per channel) " << mp3avframe->nb_samples
                     << endl;
            }


            //            recv frame 8 recv lizesize 4608 recv channel 2 recv number of audio samples (per channel) 47
            //            recv frame 8 recv lizesize 4608 recv channel 2 recv number of audio samples (per channel) 1152
            //            recv frame 8 recv lizesize 4608 recv channel 2 recv number of audio samples (per channel) 1152
            //            ......
            //            recv frame 8 recv lizesize 4608 recv channel 2 recv number of audio samples (per channel) 1106
            //从上面的log可以看出 format 是 AV_SAMPLE_FMT_FLTP ，每个声道占用的字节是 4608，recv channel 是2，第一次avframe中有47个样本，最后一次avframe有1106个样本，中间的都是1152个样本（注意 nb_samples 的单位是样本数，不是字节）




            //这里在复习一下这个 4608是怎么来的，1152*4 = 4608 ，因为是planar的，因此不用乘声道，，这里应该是mp3的要求，
            //AV_SAMPLE_FMT_FLTP 说明 一个样本占用的字节是 4个字节，
            // AV_SAMPLE_FMT_FLTP 说明你是planar的，因此 avframe 的 data[0] 存储的是LLLLLL....,一共存储的是 nb_samples（这里是1152）个样本，data[1] 存储的是RRRRRR....


            //如下的code 是将从avframe读出来的数据存储成 可以播放的 pcm文件，测试播放为： ffplay -ar 44100 -ac 2 -f f32le MP3TO.pcm
            //            if(isplanar){
            //                //planar 模式
            //                for (int i = 0; i < mp3avframe->nb_samples; i++)
            //                {
            //                    for (int ch = 0; ch < mp3avframe->ch_layout.nb_channels; ch++)  // 交错的方式写入, 大部分float的格式输出
            //                        fwrite(mp3avframe->data[ch] + data_size*i, 1, data_size, outfile);
            //                    //测试播放 ffplay -ar 44100 -ac 2 -f f32le MP3TO.pcm
            //                    //这里有一个问题，为什么是f32le，我们前面打印的是 AV_SAMPLE_FMT_FLTP，ffplay的-f 的参数是如何对应 AVSampleFormat的
            //                }
            //            }else{
            //                //交错模式
            //                fwrite(mp3avframe->data[0], 1, mp3avframe->ch_layout.nb_channels * mp3avframe->nb_samples * data_size, outfile);
            //            }

            //接着我们的思路进行处理。如果全部弄成pcm，然后再次编码成aac就可以了，但是到这里我们不使用这种方式，这种方式应该也是可以的。

            //接着主思路往下走，这时候 avframe中存储的原始的pcm，
            //将mp3转成aac，mp3的一个frame有1152样本帧，aac一个frame有1024个样本帧。
            //假设我们对于一帧的mp3 中1152中的1024个帧处理成一个 aac帧，那么剩余的1152-1024个帧，应该怎么办呢？
            //这里要借助， AVAudioFifo，也就是我们将读取到的数据，存储到 AVAudioFifo中，


        }


    }

先要解决debug前的一些问题

位置在 debug1，开始debug，这里有一个问题在debug的时候围绕着我，就是当调用

avcodec_send_packet(mp3decodercontext,mp3avpacket);

后，为什么要调用

av_packet_unref(mp3avpacket);

呢？

但是从

avcodec_receive_frame(mp3decodercontext, mp3avframe);

给mp3avframe 中写入数据后，为什么不用调用

av_frame_unref(mp3avframe)

呢？

不对称呀，没有问题吗？翻看了一下ffmpeg提供的例子，确实是这么写的呀，难道例子有内存泄漏？

基于上述这个问题，查看了下ffmpeg如下两个方法的源码：

int avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt);

int avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame);

如何查看ffmpeg的源码，下载ffmpeg的源码后，参考这个：使用sourceInsight 查看源代码-CSDN博客

avcodec_send_packet源码

从源码我们可以看到，AVCodecContext这个数据结构中有一个AVCodecInternal

AVCodecInternal *avci = avctx->internal;

在没有error发生的时候，调用 av_packet_unref(avci->buffer_pkt);将原本里面的buffer_pkt数据清空然后还原成默认值。

下来就是在条件成立的时候，调用ret = av_packet_ref(avci->buffer_pkt, avpkt);

这个av_packet_ref方法内部的实现并不是对数据做深拷贝：它把第二个参数的pts、dts等属性字段拷贝给第一个参数，并让第一个参数引用第二个参数的数据缓冲区（缓冲区的引用计数加1）；只有当第二个参数不是引用计数的packet（buf为NULL）时，才会真正分配内存并拷贝数据。

然后整个方法就结束了，从整个流程来看，

我们再调用 int avcodec_send_packet方法的整个过程中，调用了该方法：

av_packet_ref(avci->buffer_pkt, avpkt); 也就是说，增加了avpkt的引用计数，因此，我们在代码内部紧接着调用av_packet_unref(mp3avpacket); 是完全有必要的。

/*
 * Feed one compressed packet to an opened decoder.
 *
 * The caller's packet is taken as `const` and is never modified here: when it
 * carries data or side data, a new reference to it is stored in the context's
 * internal buffer_pkt via av_packet_ref(). The caller therefore still holds
 * its own reference afterwards.
 *
 * Returns 0 on success, AVERROR(EINVAL) / AVERROR_EOF for the rejected states
 * checked below, or a negative error propagated from the called helpers.
 */
int attribute_align_arg avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt)
{
    AVCodecInternal *avci = avctx->internal;
    int ret;

    /* Only an opened codec context that wraps a decoder is accepted. */
    if (!avcodec_is_open(avctx) || !av_codec_is_decoder(avctx->codec))
        return AVERROR(EINVAL);

    /* Once the internal draining flag is set, no further input is taken. */
    if (avctx->internal->draining)
        return AVERROR_EOF;

    /* A packet with a data pointer but zero size is rejected. */
    if (avpkt && !avpkt->size && avpkt->data)
        return AVERROR(EINVAL);

    /* Drop whatever the internal packet held, then reference the new input
     * (skipped when avpkt is NULL or has neither data nor side data). */
    av_packet_unref(avci->buffer_pkt);
    if (avpkt && (avpkt->data || avpkt->side_data_elems)) {
        ret = av_packet_ref(avci->buffer_pkt, avpkt);
        if (ret < 0)
            return ret;
    }

    /* Hand the internal packet to the bitstream filter; release it on failure. */
    ret = av_bsf_send_packet(avci->bsf, avci->buffer_pkt);
    if (ret < 0) {
        av_packet_unref(avci->buffer_pkt);
        return ret;
    }

    /* If the internal frame is empty, try to decode into it right away.
     * EAGAIN and EOF from that attempt are not treated as send errors. */
    if (!avci->buffer_frame->buf[0]) {
        ret = decode_receive_frame_internal(avctx, avci->buffer_frame);
        if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
            return ret;
    }

    return 0;
}

avcodec_receive_frame源码

/*
 * Fetch one output frame from the codec context.
 *
 * The destination frame is always unreferenced first, so a caller that reuses
 * one AVFrame across a receive loop does not need to unref it between calls.
 * The call is then dispatched to the decoder or encoder implementation
 * depending on the kind of codec attached to avctx, and that implementation's
 * return value is passed through unchanged.
 */
int attribute_align_arg avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
    /* Release whatever the caller's frame held from a previous call. */
    av_frame_unref(frame);

    if (av_codec_is_decoder(avctx->codec))
        return ff_decode_receive_frame(avctx, frame);
    return ff_encode_receive_frame(avctx, frame);
}

上来先将 frame 清空：av_frame_unref(frame); 联想我们在实际代码中总是会在一个循环中 avcodec_receive_frame数据，也就是说，只要不是最后一次，从第一次到中间的任何一次，都会将frame 清空。

那么最后一次的avframe是谁清空的呢？就是我们在自己写的代码中

    av_frame_free(&mp3avframe);

如果是解码操作：执行 ff_decode_receive_frame函数

再看核心函数

av_frame_move_ref(frame, avci->buffer_frame); 给frame中填充数据，并转移ref，count并没有增加。

和核心函数

ret = decode_receive_frame_internal(avctx, frame);

1.2.debug中time_base, pts, dts, duration 几个值如下图

在debug111的时候，我们可以看到 AVStream 的值如下（avg_frame_rate、r_frame_rate、start_time、time_base 等都是 AVStream 的成员，不是 AVFrame 的）：

avg_frame_rate 的值是 0 和0 ，这和预期的不一样，avg_frame_rate代表的是平均帧率，理论上在demuxing阶段，通过 avformat_find_stream_info 方法就可以获得了（不过帧率是视频流的概念，这里测试的是纯音频的mp3，所以值为0/0是正常的）

duration 的值，是3205324800

r_frame_rate 真实基本帧率，是需要通过解码才能知道的帧，且根据不同的文件形式，解码方式也不一样 debug这里值是0 和0.

sample_aspect_ratio是宽和高的比例，当前也是0和0

side_data里面是有值的， type 类型是 AV_PKT_DATA_REPLAYGAIN

start_time 这个流显示开始的时间, 注意是pts时间，也就是显示时间: 353600

time_base:流的时间基准，用于将时间戳转换为实际时间,当前是 1/14112000。计算公式为：时间戳 * timebase = 实际时间

从上述debug的值，我们可以得到：
start time = 353600 * （1/ 14112000 ） = 0.0250566893424036 秒，也就是说，这个stream流开始显示的时间是从0.0250566893424036秒开始
duration = 3205324800*(1/14112000) = 227.134693877551 秒，那么除以60就是分钟 227.134693877551/60 = 3.785578231292517 分钟 = 3分钟 + 0.785578231292517 分钟
0.785578231292517 *60 = 47.13469387755102秒，也就是通过 duration 和 time_base 就可以计算出来这首歌的时间是 3分47秒

--------------------------------

在debug 666 的时候，我们看一下 AVPacket 的值

uint8_t *data：指向保存压缩数据的指针，这就是AVPacket的实际数据。

duration: （int64_t）数据的时长，以所属媒体流的时间基准为单位，未知则值为默认值0
368640，结合AVStream的time_base计算，注意这里，是结合AVStream 的time_base计算，不是AVPacket 的time_base计算。368640 *（1/14112000） ≈ 0.0261224489795918秒（等价于 1152/44100 秒，即一个mp3帧的1152个采样点），也就是说，当前avpacket占用的时间约为0.0261224489795918秒。

pts: （int64_t）显示时间，结合AVStream->time_base转换成时间戳
dts: （int64_t）解码时间，结合AVStream->time_base转换成时间戳
size: （int）data的大小
stream_index: （int）packet在stream的index位置
flags: （int）标示，结合AV_PKT_FLAG使用，其中最低为1表示该数据是一个关键帧。

time_base 是在FFMpeg 4.3.2 上还没有的一个值，在6.0上有了这个值，目前作用不详，应用在什么场景下，也不知道。

1.3.附录：在ffmpeg 6.0 AVPacket 中新添加的timebase 说明如下：

整理为：

avpacket 中timebase 的说明：

     Time base of the packet's timestamps.
     In the future, this field may be set on packets output by encoders or demuxers, 
     but its value will be by default ignored on input to decoders or muxers.
     AVRational time_base;
数据包的时间戳的时基。
将来可以在编码器或解复用器输出的分组上设置该字段，
但在解码器或复用器的输入上将默认忽略其值。

1.4.附录：在ffmpeg 6.0 AVFrame 中新添加的timebase 说明如下：

    /**
     * Time base for the timestamps in this frame.
     * In the future, this field may be set on frames output by decoders or filters, but its value will be by default ignored on input to encoders or filters.
     */
    AVRational time_base;
    

*此帧中时间戳的时基。
*将来，该字段可能会在解码器或滤波器输出的帧上设置，但在编码器或滤波器的输入上将默认忽略其值。

1.5. AVStream的time_base


    /**
     * This is the fundamental unit of time (in seconds) in terms of which frame timestamps are represented.
     *
     * decoding: set by libavformat
     * encoding: May be set by the caller before avformat_write_header() to provide a hint to the muxer about the desired timebase.
      In avformat_write_header(), the muxer will overwrite this field with the timebase that will actually be used for the timestamps written into the file (which may or may not be related to the user-provided one, depending on the format).
     */
    AVRational time_base;
*这是表示帧时间戳的基本时间单位（以秒为单位）。
*
*解码：由libavformat设置
*encoding：可以由调用方在avformat_write_header（）之前设置，以向复用器提供有关所需时基的提示。
在avformat_write_header（）中，多路复用器将用实际用于写入文件中的时间戳的时基覆盖此字段（根据格式，可能与用户提供的时间戳有关，也可能与之无关）。

2 整理代码中不同结构体的 time_base

2.1、AVStream的time_base的单位是秒。

每种格式的time_base的值不一样，根据采样来计算，比如mpeg的pts、dts都是以90kHz来采样的，所以采样间隔就是1/90000秒。

2.2、AVCodecContext的time_base单位同样为秒，

不过精度没有AVStream->time_base高，大小为1/framerate。

2.3、AVPacket下的pts和dts以AVStream->time_base为单位(数值比较大)，

时间间隔就是AVStream->time_base。

2.4、AVFrame里面的pkt_pts和pkt_dts是拷贝自AVPacket，同样以AVStream->time_base为单位；

而pts是为输出(显示)准备的，以AVCodecContext->time_base为单位。

2.5、输入流InputStream下的pts和dts以AV_TIME_BASE为单位(微秒)，

至于为什么要转化为微秒，可能是为了避免使用浮点数。

2.6、输出流OutputStream涉及音视频同步，结构和InputStream不同，

暂时只作记录，不分析

3. ffmpeg命令行中的三种时间基 tbr、tbn 和 tbc

不同的封装格式具有不同的时间基。在 FFmpeg 处理音视频过程中的不同阶段，也会采用不同的时间基。
FFmpeg 中有三种时间基，命令行中 tbr、tbn 和 tbc 的打印值就是这三种时间基的倒数：

tbn：对应容器中的时间基。值是 AVStream.time_base 的倒数
tbc：对应编解码器中的时间基。值是 AVCodecContext.time_base 的倒数
tbr：从视频流中猜算得到，可能是帧率或场率(帧率的 2 倍)

4 内部时间基 AV_TIME_BASE

除以上三种时间基外，FFmpeg 还有一个内部时间基 AV_TIME_BASE(以及分数形式的 AV_TIME_BASE_Q)：

// Internal time base represented as integer
#define AV_TIME_BASE            1000000
// Internal time base represented as fractional value
#define AV_TIME_BASE_Q          (AVRational){1, AV_TIME_BASE}
//AV_TIME_BASE 及 AV_TIME_BASE_Q 用于 FFmpeg 内部函数处理，使用此时间基计算得到时间值表示的是微秒。

5.关于时间的各种操作

5.1.时间基的转换，为什么要有时间基转换

首先，不同的封装格式，timebase是不一样的。另外，整个转码过程，不同的数据状态对应的时间基也不一致。
- 拿mpegts封装格式25fps来说（只说视频，音频大致一样，但也略有不同）。
- 非压缩时候的数据（即YUV或者其它），在ffmpeg中对应的结构体为AVFrame,它的时间基为AVCodecContext 的time_base ,AVRational{1,25}。
- 压缩后的数据（对应的结构体为AVPacket）对应的时间基为AVStream的time_base，AVRational{1,90000}。
因为数据状态不同，时间基不一样，所以我们必须转换，在1/25时间刻度下占10格，在1/90000下是占多少格。这就是pts的转换

时间基转换函数
- av_rescale_q(int64_t a, AVRational bq, AVRational cq)函数
- 这个函数的作用是计算a*bq / cq来把时间戳从一个时间基调整到另外一个时间基。在进行时间基转换的时候，应该首选这个函数，因为它可以避免溢出的情况发生。
- 函数表示在bq下的占a个格子，在cq下是多少。

5.2 计算视频总时长

AVFormatContext *ifmt_ctx = NULL;
avformat_open_input(&ifmt_ctx, filename, NULL, NULL);
double totle_seconds = ifmt_ctx->duration * av_q2d(AV_TIME_BASE_Q);


av_q2d函数原型 
/* Convert the rational a (num/den) to a double; the denominator is cast to
 * double first, so the division is floating-point rather than integer. */
static inline double av_q2d(AVRational a){
    return a.num / (double) a.den;
}

AV_TIME_BASE_Q = (AVRational){1, AV_TIME_BASE}

5.3 根据PTS求出一帧在视频中对应的秒数位置

该pts在AVPacket 和 AVFrame中都可以这样实现

double sec = enc_pkt.pts * av_q2d(ofmt_ctx->streams[stream_index]->time_base);

5.4 ffmpeg内部的时间戳与标准的时间转换方法

timestamp(ffmpeg内部时间戳) = AV_TIME_BASE * time(秒)
//其中 AV_TIME_BASE=1000000，其实就是将 单位 秒/s 转换成了 微秒/us
time(秒) = AV_TIME_BASE_Q * timestamp(ffmpeg内部时间戳)
//AV_TIME_BASE_Q=1/AV_TIME_BASE, 即（1微秒=1/1000000 秒）

5.5 当需要把视频Seek到N秒的时候


int av_seek_frame(AVFormatContext *s, int stream_index, int64_t timestamp, int flags);
参数：
  s: AVFormatContext 指针。包含了流媒体的信息。
  stream_index：     流索引，流即视频流，音频流等，视频流索引为0，音频流索引为1 。-1：表示默认流。
  timestamp:         时间戳
  flags：seek标志，有以下四种：
        AVSEEK_FLAG_BACKWARD      是seek到请求的时间戳之前最近的关键帧
        AVSEEK_FLAG_BYTE          是基于字节位置的查找，精确到字节 mp4格式不能使用该标识会seek失败
        AVSEEK_FLAG_ANY           是可以seek到任意帧，不一定是关键帧，可能是p帧，b帧，因此使用时可能会导致花屏    flv格式不能使用该标识会seek失败
        AVSEEK_FLAG_FRAME         是基于帧数量快进 

        参数stream_index为视频流索引时，跳转是以视频时间戳为基准，stream_index为音频流索引时跳转是以音频流时间戳为基准。通常视频流索引为0，音频流索引为1，具体的索引根据打包时的定义确定。


sec(实际时间) = pts*av_q2d(stream[video_index]->time_base);
 
pts = sec/av_q2d(stream[video_index]->time_base);

第三个参数timestamp的详细说明：

这里的timestamp代表的是想要移动到的起始位置的时间戳，注意这里是起始位置的时间戳，不是起始位置的秒数! 通俗地讲，它就是起始位置的pts，因此一个10s的视频，你想移动到5s的位置，直接传5是不对的。在 FFmpeg 中，时间戳(timestamp)的单位是时间基数(time_base)，时间戳值乘以时间基，可以得到实际的时刻值(以秒等为单位)。例如，如果一个视频帧的 dts 是 40，pts 是 160，其 time_base 是 1/1000 秒，那么可以计算出此视频帧的解码时刻是 40 毫秒(40/1000)，显示时刻是 160 毫秒(160/1000)。FFmpeg 中时间戳(pts/dts)的类型是int64_t 类型，如果把一个 time_base 看作一个时钟脉冲，那么 dts/pts 则可以看作是时钟脉冲的计数。

第四个参数下面分别对其进行详细说明：

一. AVSEEK_FLAG_BACKWARD

实际业务中有这样一个场景：用户经常会通过拖动视频底部进度条的方式来跳转到某一帧的位置，那这个时候如何确定当前帧的具体位置呢？有一种办法就是计算当前拖动位置的百分比，然后根据当前视频的总时长，乘以该百分比，得到当前跳转位置的时间，然后根据该时间进行seek操作。

比方说总时长是1000ms, 那如果用户拖动到中间的位置，那就应该seek到500ms的位置，这个时候把500ms传入到上面的函数中进行seek就可以了。可这样有一个问题，你把500ms传入到上面的视频帧当中，是否真的有pts为500ms的视频帧呢？很难，有可能根本就没有pts为500ms的视频帧，这个时候可能有498，也有可能有501的，那到底是取498的呢？还是取501的呢？这个时候就要有一套策略，AVSEEK_FLAG_BACKWARD这个FLAG就相当于标识往回找（向时间更早的方向），也就是找时间戳不大于500ms的最近的关键帧，即pts为498的那一帧（假设它是关键帧）；如果不带这个标志，则会找时间戳不小于500ms的关键帧，即pts为501的那一帧。

二. AVSEEK_FLAG_BYTE

这种对应的另外一种场景：假设我想移动视频到中间的位置，但是当前视频文件却没有索引, 不过我却知道这个视频文件的大小(1M), 那这个时候要移动视频到中间位置的话，其实就是应该对应在500KB左右的位置，AVSEEK_FLAG_BYTE这个FLAG就相当于是根据字节数来移动它的位置。

三. AVSEEK_FLAG_ANY

移动视频到任意帧的位置。也就是说，seek到的位置可能是关键帧，也可能是非关键帧，注意：如果移动到的是非关键帧，这个时候解码可能会失败(因为少了前面的关键帧做参考)，出现的后果是可能会造成花屏。

四. AVSEEK_FLAG_FRAME

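基于帧序号进行seek：带上这个标志后，timestamp参数被解释为帧号而不是时间戳（并非所有解复用器都支持）。需要注意，"移动到关键帧位置"其实是不带AVSEEK_FLAG_ANY时的默认行为，而不是AVSEEK_FLAG_FRAME的含义：默认情况下会强制移动到关键帧位置，比如你要移动的位置是500ms, 但是在500ms没有关键帧，但是在前面400ms的位置有关键帧，它就会移到400ms的位置，这样的话画面就能够正常显示。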

注意这种策略也有一定的问题，假设当前视频的GOP是100帧，这100帧的时长是4s, 那这样的话拖动到这4s的任意位置，画面可能都不会改变(由于设置了移动到关键帧，而一个GOP中只有第一帧才是关键帧)，这样会给人一个错觉——移动失败了，这点需要注意。

视频和音频跳到相同的时间位置但是流数据帧位置不相同，因为一般情况下视频帧的起始时间戳为0，音频帧的时间戳为-1024 （为啥是-1024 ？这块没有理解，todo）。

跳转到文件的起始位置：当第二个参数流索引为-1时标识默认流索引。未指定指定流索引，可以用-1代表默认的流索引。

av_seek_frame(s, -1, 0, AVSEEK_FLAG_BACKWARD);

5.6 关于音频pts的计算

//音频sample_rate:samples per second，即采样率，表示每秒采集多少采样点。
//比如44100HZ，就是一秒采集44100个sample. 即每个sample的时间是1/44100秒
//一个音频帧的AVFrame有nb_samples个sample，所以一个AVFrame耗时是nb_samples*（1/44100）秒，即标准时间下duration_s=nb_samples*（1/44100）秒，
//转换成AVStream时间基下 
duration=duration_s / av_q2d(st->time_base)
//即duration个时间基
//另外，音频流的st->time_base一般是{1, 采样率}，即den值一般等于采样率（num为1）,所以duration=nb_samples. pts=n*duration=n*nb_samples

6. ffmpeg 相关的API说明

6.1 av_q2d()，将时间从 AVRational 形式转换为 double 形式

av_q2d()将时间从 AVRational 形式转换为 double 形式。AVRational 是分数类型，double 是双精度浮点数类型，转换的结果单位是秒。转换前后的值基于同一时间基，仅仅是数值的表现形式不同而已。

av_q2d()实现如下：

/**
 * Convert an AVRational to a `double`.
 *
 * The denominator is cast to `double` before dividing, so the result is a
 * floating-point quotient of `a.num` and `a.den`, not an integer division.
 *
 * @param a AVRational to convert
 * @return `a` in floating-point form
 * @see av_d2q()
 */
static inline double av_q2d(AVRational a){
    return a.num / (double) a.den;
}

av_q2d()使用方法如下：

AVStream stream;
AVPacket packet;
packet 播放时刻值：timestamp(单位秒) = packet.pts × av_q2d(stream.time_base);
packet 播放时长值：duration(单位秒) = packet.duration × av_q2d(stream.time_base);

6.2 时间基转换函数av_rescale_q

av_rescale_q

/**
 * Rescale a 64-bit integer by 2 rational numbers.
 *
 * The operation is mathematically equivalent to `a × bq / cq`.
 *
 * This function is equivalent to av_rescale_q_rnd() with #AV_ROUND_NEAR_INF.
 *
 * @see av_rescale(), av_rescale_rnd(), av_rescale_q_rnd()
 */
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq) av_const;

    AVRational src_time_base = (AVRational){1, 25};

    AVRational dst_time_base = (AVRational){1, 90000};

    int64_t pts = 2;

    int64_t new_pts = av_rescale_q(pts, src_time_base , dst_time_base);

6.3 AVPacket 中的时间基的计算av_packet_rescale_ts。

这个是啥意思呢？就是将AVPacket 中的 pts，dts，duration的值都变化了

av_packet_rescale_ts()用于将 AVPacket 中各种时间值从一种时间基转换为另一种时间基。

参数一是 AVPacket* ，也就是将AVPacket 传递进去后，该方法内部会将 该 AVPacket的dts，pts，duration改变，变成tb_dst 时间基下的  dts，pts，duration

/**
 * Convert valid timing fields (timestamps / durations) in a packet from one
 * timebase to another. Timestamps with unknown values (AV_NOPTS_VALUE) will be
 * ignored.
 *
 * @param pkt packet on which the conversion will be performed
 * @param tb_src source timebase, in which the timing fields in pkt are
 *               expressed
 * @param tb_dst destination timebase, to which the timing fields will be
 *               converted
 */
void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst);

参考源码实现

/*
 * Rescale the timing fields of pkt in place from time base src_tb to dst_tb.
 *
 * pts and dts are converted only when they are not AV_NOPTS_VALUE; duration is
 * converted only when it is positive. Each conversion goes through
 * av_rescale_q().
 */
void av_packet_rescale_ts(AVPacket *pkt, AVRational src_tb, AVRational dst_tb)
{
    /* Unknown timestamps are left untouched. */
    if (pkt->pts != AV_NOPTS_VALUE)
        pkt->pts = av_rescale_q(pkt->pts, src_tb, dst_tb);
    if (pkt->dts != AV_NOPTS_VALUE)
        pkt->dts = av_rescale_q(pkt->dts, src_tb, dst_tb);
    /* A zero or negative duration is not rescaled. */
    if (pkt->duration > 0)
        pkt->duration = av_rescale_q(pkt->duration, src_tb, dst_tb);
    /* convergence_duration is handled only in builds where
     * FF_API_CONVERGENCE_DURATION is enabled; the surrounding macros silence
     * deprecation warnings for that access. */
#if FF_API_CONVERGENCE_DURATION
FF_DISABLE_DEPRECATION_WARNINGS
    if (pkt->convergence_duration > 0)
        pkt->convergence_duration = av_rescale_q(pkt->convergence_duration, src_tb, dst_tb);
FF_ENABLE_DEPRECATION_WARNINGS
#endif
}

我们从一个AVPacket 转到另一个 AVPacket 的时候（场景有啥呢？todo）

6.4 转封装过程中的时间基转换

容器中的时间基(AVStream.time_base,即前面的 tbn)定义如下：

AVStream.time_base 是 AVPacket 中 pts 和 dts 的时间单位，输入流与输出流中 time_base 按如下方式确定：

对于输入流：打开输入文件后，调用 avformat_find_stream_info()可获取到每个流中的 time_base
对于输出流：打开输出文件后，调用 avformat_write_header()可根据输出文件封装格式确定每个流的 time_base 并写入输出文件中

不同封装格式具有不同的时间基，在转封装(将一种封装格式转换为另一种封装格式)过程中，时间基转换相关代码如下：

av_read_frame(ifmt_ctx, &pkt);
pkt.pts = av_rescale_q_rnd(pkt.pts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
pkt.dts = av_rescale_q_rnd(pkt.dts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
pkt.duration = av_rescale_q(pkt.duration, in_stream->time_base, out_stream->time_base);

下面的代码具有和上面代码相同的效果：

// 从输入文件中读取 packet
av_read_frame(ifmt_ctx, &pkt);
// 将 packet 中的各时间值从输入流封装格式时间基转换到输出流封装格式时间基
av_packet_rescale_ts(&pkt, in_stream->time_base, out_stream->time_base);

6.5 转码过程中的时间基转换

编解码器中的时间基(AVCodecContext.time_base，即前面的tbc)定义如下：

typedef struct AVCodecContext {
    ......
    
    /**
     * This is the fundamental unit of time (in seconds) in terms
     * of which frame timestamps are represented. For fixed-fps content,
     * timebase should be 1/framerate and timestamp increments should be
     * identically 1.
     * This often, but not always is the inverse of the frame rate or field rate
     * for video. 1/time_base is not the average frame rate if the frame rate is not
     * constant.
     *
     * Like containers, elementary streams also can store timestamps, 1/time_base
     * is the unit in which these timestamps are specified.
     * As example of such codec time base see ISO/IEC 14496-2:2001(E)
     * vop_time_increment_resolution and fixed_vop_rate
     * (fixed_vop_rate == 0 implies that it is different from the framerate)
     *
     * - encoding: MUST be set by user.
     * - decoding: the use of this field for decoding is deprecated.
     *             Use framerate instead.
     */
    AVRational time_base;
    
    ......
}

上述注释指出，AVCodecContext.time_base 是帧率(视频帧)的倒数，每帧时间戳递增 1，那么 tbc 就等于帧率。编码过程中，应由用户设置好此参数。解码过程中，此参数已过时，建议直接使用帧率倒数用作时间基。

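这里有一个问题：按照此处注释说明，帧率为 25 的视频流，tbc 理应为 25，但实际值却为 50，不知作何解释？是否 tbc 已经过时，不具参考意义？一种常见的解释是：对于 H.264、MPEG-2 这类支持场编码的编解码器，AVCodecContext.ticks_per_frame 为 2，解码器的 time_base 是按 1/(帧率 × ticks_per_frame) 设置的，所以 25fps 的流 tbc 显示为 50；这也是注释中建议解码时改用 framerate 的原因之一。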

根据注释中的建议，实际使用时，在视频解码过程中，我们不使用 AVCodecContext.time_base，而用帧率倒数作时间基，在视频编码过程中，我们将 AVCodecContext.time_base 设置为帧率的倒数。

6.6 视频流

视频按帧播放，所以解码后的原始视频帧时间基为 1/framerate。

视频解码过程中的时间基转换处理：

AVFormatContext *ifmt_ctx;
AVStream *in_stream;
AVCodecContext *dec_ctx;
AVPacket packet;
AVFrame *frame;

// 从输入文件中读取编码帧
av_read_frame(ifmt_ctx, &packet);

// 时间基转换
int raw_video_time_base = av_inv_q(dec_ctx->framerate);
av_packet_rescale_ts(packet, in_stream->time_base, raw_video_time_base);

// 解码
avcodec_send_packet(dec_ctx, packet)
avcodec_receive_frame(dec_ctx, frame);

视频编码过程中的时间基转换处理：

AVFormatContext *ofmt_ctx;
AVStream *out_stream;
AVCodecContext *dec_ctx;
AVCodecContext *enc_ctx;
AVPacket packet;
AVFrame *frame;

// Encode: avcodec_receive_packet() fills the packet through a pointer
avcodec_send_frame(enc_ctx, frame);
avcodec_receive_packet(enc_ctx, &packet);

// Time base conversion: encoder time base -> output stream time base
packet.stream_index = out_stream->index;
// NOTE(review): in a real program the encoder time base is normally assigned
// before the encoder is opened; it is shown here only to make the value explicit.
enc_ctx->time_base = av_inv_q(dec_ctx->framerate);
av_packet_rescale_ts(&packet, enc_ctx->time_base, out_stream->time_base);

// Write the encoded packet to the output media file
av_interleaved_write_frame(ofmt_ctx, &packet);

6.7音频流

音频按采样点播放，所以解码后的原始音频帧时间基为 1/sample_rate

音频解码过程中的时间基转换处理：

AVFormatContext *ifmt_ctx;
AVStream *in_stream;
AVCodecContext *dec_ctx;
AVPacket packet;
AVFrame *frame;

// 从输入文件中读取编码帧
av_read_frame(ifmt_ctx, &packet);

// 时间基转换
int raw_audio_time_base = av_inv_q(dec_ctx->sample_rate);
av_packet_rescale_ts(packet, in_stream->time_base, raw_audio_time_base);

// 解码
avcodec_send_packet(dec_ctx, packet)
avcodec_receive_frame(dec_ctx, frame);

音频编码过程中的时间基转换处理：

AVFormatContext *ofmt_ctx;
AVStream *out_stream;
AVCodecContext *dec_ctx;
AVCodecContext *enc_ctx;
AVPacket packet;
AVFrame *frame;

// Encode: avcodec_receive_packet() fills the packet through a pointer
avcodec_send_frame(enc_ctx, frame);
avcodec_receive_packet(enc_ctx, &packet);

// Time base conversion: encoder time base -> output stream time base
packet.stream_index = out_stream->index;
// sample_rate is a plain int, so the 1/sample_rate time base is built directly
// as an AVRational instead of being passed through av_inv_q().
// NOTE(review): in a real program the encoder time base is normally assigned
// before the encoder is opened; it is shown here only to make the value explicit.
enc_ctx->time_base = (AVRational){1, dec_ctx->sample_rate};
av_packet_rescale_ts(&packet, enc_ctx->time_base, out_stream->time_base);

// Write the encoded packet to the output media file
av_interleaved_write_frame(ofmt_ctx, &packet);

6.8 音视频时间戳比较函数 av_compare_ts(）

/*
 * Compare two timestamps that are expressed in different time bases.
 *
 * Returns -1 when ts_a (in tb_a) is earlier than ts_b (in tb_b), 1 when it is
 * later, and 0 when neither comparison below finds a difference.
 */
int av_compare_ts(int64_t ts_a, AVRational tb_a, int64_t ts_b, AVRational tb_b)  
{  
    /* Cross-multiplied scale factors: comparing ts_a*a with ts_b*b puts both
     * timestamps on the common denominator tb_a.den * tb_b.den. */
    int64_t a = tb_a.num * (int64_t)tb_b.den;  
    int64_t b = tb_b.num * (int64_t)tb_a.den;  
    /* Fast path: when all four magnitudes OR'ed together stay within INT_MAX,
     * compare the products directly. */
    if ((FFABS(ts_a)|a|FFABS(ts_b)|b) <= INT_MAX)  
        return (ts_a*a > ts_b*b) - (ts_a*a < ts_b*b);  
    /* Otherwise rescale one timestamp into the other's units, rounding down,
     * and compare in each direction. */
    if (av_rescale_rnd(ts_a, a, b, AV_ROUND_DOWN) < ts_b)  
        return -1;  
    if (av_rescale_rnd(ts_b, b, a, AV_ROUND_DOWN) < ts_a)  
        return 1;  
    return 0;  
}

【参数分析】

ts_a：frame a的pts
tb_a：a对应的时间基
ts_b：frame b的pts
tb_b：b对应的时间基

【返回值】

如果a在b之前，返回 -1（即a先显示）
如果a在b之后，返回 1
如果位置相同，返回 0

应用时机为：

	while (1) {
		AVFormatContext *ifmt_ctx;
		int stream_index=0;
		AVStream *in_stream, *out_stream;
		//Get an AVPacket //2 av_compare_ts比较音视频pts（第一组参数是视频，第二组是音频）：返回值<=0表示视频时间戳不晚于音频，此时应先写入一帧视频；返回值>0表示音频时间戳更早，此时应先写入音频
		if(av_compare_ts(cur_pts_v,ifmt_ctx_v->streams[videoindex_v]->time_base,cur_pts_a,ifmt_ctx_a->streams[audioindex_a]->time_base) <= 0){
			ifmt_ctx=ifmt_ctx_v;
			stream_index=videoindex_out;
 
			if(av_read_frame(ifmt_ctx, &pkt) >= 0){
				do{
					in_stream  = ifmt_ctx->streams[pkt.stream_index];
					out_stream = ofmt_ctx->streams[stream_index];
 
					if(pkt.stream_index==videoindex_v){
						//FIX：No PTS (Example: Raw H.264)
						//Simple Write PTS
						if(pkt.pts==AV_NOPTS_VALUE){
							//Write PTS
							AVRational time_base1=in_stream->time_base;
							//Duration between 2 frames (us)//按ffmpeg中的1秒(即90000)来计算每帧的间隔；90000 / 25 = 3600(ffmpeg)
							int64_t calc_duration=(double)AV_TIME_BASE/av_q2d(in_stream->r_frame_rate);
							//Parameters//计算一桢在整个视频中的时间位置timestamp(秒) = pts * av_q2d(st->time_base);
							pkt.pts=(double)(frame_index*calc_duration)/(double)(av_q2d(time_base1)*AV_TIME_BASE);
							pkt.dts=pkt.pts;
							pkt.duration=(double)calc_duration/(double)(av_q2d(time_base1)*AV_TIME_BASE);
							frame_index++;
						}
 
						cur_pts_v=pkt.pts;
                        //注意：读取一帧之后break
						break;
					}
				}while(av_read_frame(ifmt_ctx, &pkt) >= 0);
			}
            else { //skip audio
                    ...
                }
		}