FFmpeg mov/mp4 demuxing source analysis: mov_read_header (reading metadata), mov_read_packet (reading sample data), and mov_read_trak

I've been digging into FFmpeg recently and found that source-level analyses of its demuxing code are scarce and incomplete online, so I'm summarizing my own here. My analysis of FFmpeg's mov/mp4 demuxing is essentially an analysis of mov.c. The goal is to show how an mp4's AVStream and AVPacket get filled in (these two structures are the input to the decoder), and to explain the demuxing process, e.g. how pts and dts are derived, which helps with any secondary development on FFmpeg. The MP4 container format itself is documented in detail elsewhere, so I won't repeat it here; straight to the code, with comments inline.

All the interfaces and data structures are annotated in detail, but it took a good while to research and was exhausting to write up; if you find it useful, a follow would be appreciated, haha.

Key tips:

  1. Many FFmpeg structs (AVStream, URLContext, AVFormatContext) carry a void *priv_data member.
    This member stores that struct's format-specific "child" structure: AVStream's priv_data holds the mov demuxer's MOVStreamContext, and URLContext's priv_data holds the file protocol's FileContext. The point is to separate protocol-specific data and functions from the generic framework, which keeps the whole library extensible. That is why, at the start of every protocol interface, you will see the generic priv_data assigned to the protocol's own structure, e.g. in mov_read_header:
    MOVContext *mov = s->priv_data; Written this way (a kind of syntactic sugar), the inner code works only with mov and is insulated from the priv_data name,
    so even if outer variables are renamed, the internal interfaces are barely affected. Most FFmpeg interfaces use this pattern, especially those involving external protocols:
    rtmp streaming, file I/O, the mov format, and so on.

  2. For names ending in "Context" (URLContext, FileContext, AVFormatContext, ...), my reading is: the data plus the methods (interfaces) needed to do a job. For example, the file protocol behind URLContext, FileContext, has open, close, and read methods plus a data buffer holding bytes read from the file. These contexts nest level by level, which keeps the code extensible; a library like this is, after all, written by many people. Hopefully that makes sense, haha.

  3. Names containing "internal", like AVStreamInternal, generally hold data that is stored and passed along for the interfaces to use.

  4. Media files are byte streams, so multi-byte fields must be read with explicit endianness: AV_RB32/avio_rb32 reads 4 bytes big-endian (MP4 box sizes and most fields are big-endian), while AV_RL32/avio_rl32 reads them little-endian (used for fourcc type tags).

A note up front: an "atom" in the code is just a "box" in the MP4 spec, and the "sample" that the code and spec keep mentioning is simply one audio/video frame.

MP4 stores data as boxes, including nested boxes; every box consists of three parts: size + type + body. At the top level the main boxes are ftyp, moov, and mdat. ftyp identifies the file as MP4; moov holds the metadata for the file's samples, describing each sample's size, position, dts, and so on; mdat holds the sample data itself. Each sample's position within the file is derived from the trak boxes inside moov. For details, see the companion article on the trak box (atom) module, covering mov_read_stsd, (stts), (stss), (ctts), (stsc), (stsz), and (stco).

Parsing MP4 boxes boils down to reading each kind of box and filling in fields from the bytes read via file_read. Some boxes carry only identification info; I won't dwell on those, and will focus on the interfaces that matter for decoding, skimming the rest.

Key interfaces

Key tip:
every interface here begins with:

MOVContext *mov = s->priv_data;  //format-private context, zeroed at allocation
AVIOContext *pb = s->pb;

The benefit is extensibility: every interface takes the same parameter (AVFormatContext *s), and the outermost AVFormatContext's priv_data is recovered into the MOVContext that the code then works with, which is convenient to write against. Even if outer variables or names change, the internal interfaces are barely affected. Most FFmpeg interfaces use this pattern, especially those involving external protocols: rtmp streaming, file I/O, the mov format, and so on.

//an atom in the code is simply a box in the MP4 spec
typedef struct MOVAtom {
    uint32_t type;//box type (fourcc)
    int64_t size; /* total size (excluding the size and type fields); mov_read_default subtracts the 8 header bytes before passing an atom down */
} MOVAtom;

static int mov_read_header(AVFormatContext *s)
{
    MOVContext *mov = s->priv_data;  //format-private context, zeroed at allocation
    AVIOContext *pb = s->pb;
    int j, err;
    MOVAtom atom = { AV_RL32("root") }; //create a synthetic parent box covering the top-level ftyp/moov/mdat boxes
    int i;
    ...
    mov->fc = s; //keep a back-pointer to the AVFormatContext for convenient access; keeps the code extensible
    mov->trak_index = -1;
    /* .mov and .mp4 aren't streamable anyway (only progressive download if moov is before mdat) */
    if (pb->seekable & AVIO_SEEKABLE_NORMAL)
        atom.size = avio_size(pb); //total size of the source MP4 file
    else
        atom.size = INT64_MAX;

    /* check MOV header */
    //loop until the moov box has been read; normally one pass is enough, because mov_read_default keeps descending into nested boxes as long as atom.size has data left
    do {
        if (mov->moov_retry)
            avio_seek(pb, 0, SEEK_SET);
        //nested boxes: mov_read_default keeps reading downwards
        if ((err = mov_read_default(mov, pb, atom)) < 0) {
            av_log(s, AV_LOG_ERROR, "error reading header\n");
            goto fail;
        }
    } while ((pb->seekable & AVIO_SEEKABLE_NORMAL) && !mov->found_moov && !mov->moov_retry++);
    if (!mov->found_moov) { //flag set once a moov box has been parsed
        av_log(s, AV_LOG_ERROR, "moov atom not found\n");
        err = AVERROR_INVALIDDATA;
        goto fail;
    }
    av_log(mov->fc, AV_LOG_TRACE, "on_parse_exit_offset=%"PRId64"\n", avio_tell(pb));

    if (pb->seekable & AVIO_SEEKABLE_NORMAL) {
        ......
    }

    /* copy timecode metadata from tmcd tracks to the related video streams */
    for (i = 0; i < s->nb_streams; i++) {
        ...
    }
    export_orphan_timecode(s);

    //s->streams was allocated and filled while reading the trak boxes
    for (i = 0; i < s->nb_streams; i++) {
        AVStream *st = s->streams[i];
        MOVStreamContext *sc = st->priv_data;
        fix_timescale(mov, sc); //sanitize the time scale read from mdhd
        if (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO &&
            st->codecpar->codec_id   == AV_CODEC_ID_AAC) {
            st->internal->skip_samples = sc->start_pad;
        }
        if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO && sc->nb_frames_for_fps > 0 && sc->duration_for_fps > 0)
            av_reduce(&st->avg_frame_rate.num, &st->avg_frame_rate.den,
                      sc->time_scale*(int64_t)sc->nb_frames_for_fps, sc->duration_for_fps, INT_MAX);
        if (st->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
            if (st->codecpar->width <= 0 || st->codecpar->height <= 0) {
                st->codecpar->width  = sc->width;
                st->codecpar->height = sc->height;
            }
            if (st->codecpar->codec_id == AV_CODEC_ID_DVD_SUBTITLE) {
                if ((err = mov_rewrite_dvd_sub_extradata(st)) < 0)
                    goto fail;
            }
        }
        ...
    }
    if (mov->trex_data) {
        ...
    }
    if (mov->use_mfra_for > 0) {
        ...
    }
    for (i = 0; i < mov->bitrates_count && i < s->nb_streams; i++) {
        if (mov->bitrates[i]) {
            s->streams[i]->codecpar->bit_rate = mov->bitrates[i];//per-stream bit rate
        }
    }

    ff_rfps_calculate(s);

    for (i = 0; i < s->nb_streams; i++) {
        AVStream *st = s->streams[i];
        MOVStreamContext *sc = st->priv_data;

        switch (st->codecpar->codec_type) {
        case AVMEDIA_TYPE_AUDIO:
            err = ff_replaygain_export(st, s->metadata);
            if (err < 0)
                goto fail;
            break;
        case AVMEDIA_TYPE_VIDEO:
            ...
    }
    ff_configure_buffers_for_index(s, AV_TIME_BASE);

    for (i = 0; i < mov->frag_index.nb_items; i++)
        if (mov->frag_index.item[i].moof_offset <= mov->fragment.moof_offset)
            mov->frag_index.item[i].headers_read = 1;

    return 0;
fail:
    mov_read_close(s);
    return err;
}
//all atom-parsing handlers registered in one table
typedef struct MOVParseTableEntry {
    uint32_t type;
    int (*parse)(MOVContext *ctx, AVIOContext *pb, MOVAtom atom);
} MOVParseTableEntry;

static const MOVParseTableEntry mov_default_parse_table[] = {
{ MKTAG('f','t','y','p'), mov_read_ftyp },
{ MKTAG('t','r','a','k'), mov_read_trak },
{ MKTAG('t','r','a','f'), mov_read_default },
{ MKTAG('s','t','s','c'), mov_read_stsc },
{ MKTAG('s','t','s','d'), mov_read_stsd }, /* sample description */
{ MKTAG('s','t','s','s'), mov_read_stss }, /* sync sample */
{ MKTAG('s','t','s','z'), mov_read_stsz }, /* sample size */
{ MKTAG('s','t','t','s'), mov_read_stts },
{ MKTAG('s','t','z','2'), mov_read_stsz }, /* compact sample size */
.....
};

//This is the most important interface: it reads nested boxes in a loop for as long as atom.size has data left,
//e.g. moov->trak->mdia->stbl->stsd, stts, stss, ctts, stco, ...
//which is why mov_read_header above only needs to call mov_read_default once: it starts from the synthetic root box and descends
static int mov_read_default(MOVContext *c, AVIOContext *pb, MOVAtom atom)
{
    int64_t total_size = 0;//bytes consumed so far within this atom (nested or not)
    MOVAtom a;
    int i;

    if (c->atom_depth > 10) {
        av_log(c->fc, AV_LOG_ERROR, "Atoms too deeply nested\n");
        return AVERROR_INVALIDDATA;
    }
    c->atom_depth ++; //nesting depth of the current atom

    if (atom.size < 0)
        atom.size = INT64_MAX;
    while (total_size <= atom.size - 8 && !avio_feof(pb)) {
        int (*parse)(MOVContext*, AVIOContext*, MOVAtom) = NULL;
        a.size = atom.size;
        a.type=0;
        if (atom.size >= 8) {
            a.size = avio_rb32(pb);//this box's size (covering size+type+body)
            a.type = avio_rl32(pb);//this box's type
            //type comparisons
            if (((a.type == MKTAG('f','r','e','e') && c->moov_retry) ||
                  a.type == MKTAG('h','o','o','v')) &&
                a.size >= 8 &&
                c->fc->strict_std_compliance < FF_COMPLIANCE_STRICT) {
                uint32_t type;
                avio_skip(pb, 4);
                type = avio_rl32(pb);
                avio_seek(pb, -8, SEEK_CUR);
                if (type == MKTAG('m','v','h','d') ||
                    type == MKTAG('c','m','o','v')) {
                    av_log(c->fc, AV_LOG_ERROR, "Detected moov in a free or hoov atom.\n");
                    a.type = MKTAG('m','o','o','v');
                }
            }
            if (atom.type != MKTAG('r','o','o','t') &&
                atom.type != MKTAG('m','o','o','v')) {
                if (a.type == MKTAG('t','r','a','k') ||
                    a.type == MKTAG('m','d','a','t')) {
                    av_log(c->fc, AV_LOG_ERROR, "Broken file, trak/mdat not at top-level\n");
                    avio_skip(pb, -8);
                    c->atom_depth --;
                    return 0;
                }
            }
            total_size += 8; //8 header bytes consumed
            if (a.size == 1 && total_size + 8 <= atom.size) { /* 64 bit extended size */
                a.size = avio_rb64(pb) - 8;
                total_size += 8;
            }
        }
        av_log(c->fc, AV_LOG_TRACE, "type:'%s' parent:'%s' sz: %"PRId64" %"PRId64" %"PRId64"\n",
               av_fourcc2str(a.type), av_fourcc2str(atom.type), a.size, total_size, atom.size);
        if (a.size == 0) {
            a.size = atom.size - total_size + 8;
        }
        a.size -= 8;
        if (a.size < 0)
            break;
        a.size = FFMIN(a.size, atom.size - total_size);

        //look up the matching box-parsing handler in the table
        for (i = 0; mov_default_parse_table[i].type; i++)
            if (mov_default_parse_table[i].type == a.type) {
                parse = mov_default_parse_table[i].parse;
                break;
            }

        // container is user data
        if (!parse && (atom.type == MKTAG('u','d','t','a') ||
                       atom.type == MKTAG('i','l','s','t')))
            parse = mov_read_udta_string;

        // Supports parsing the QuickTime Metadata Keys.
        // https://developer.apple.com/library/mac/documentation/QuickTime/QTFF/Metadata/Metadata.html
        if (!parse && c->found_hdlr_mdta &&
            atom.type == MKTAG('m','e','t','a') &&
            a.type == MKTAG('k','e','y','s') &&
            c->meta_keys_count == 0) {
            parse = mov_read_keys;
        }

        if (!parse) { /* skip leaf atoms data */
            avio_skip(pb, a.size);
        } else {
            int64_t start_pos = avio_tell(pb);//current offset from the start of the file, used to verify the box/atom gets fully consumed
            int64_t left;
            int err = parse(c, pb, a);
            if (err < 0) {
                c->atom_depth --;
                return err;
            }
            if (c->found_moov && c->found_mdat &&
                ((!(pb->seekable & AVIO_SEEKABLE_NORMAL) || c->fc->flags & AVFMT_FLAG_IGNIDX || c->frag_index.complete) ||
                 start_pos + a.size == avio_size(pb))) {
                if (!(pb->seekable & AVIO_SEEKABLE_NORMAL) || c->fc->flags & AVFMT_FLAG_IGNIDX || c->frag_index.complete)
                    c->next_root_atom = start_pos + a.size;
                c->atom_depth --;
                return 0;
            }
            left = a.size - avio_tell(pb) + start_pos;//check whether this box/atom has been fully consumed
            if (left > 0) /* skip garbage at atom end */
                avio_skip(pb, left);
            else if (left < 0) {
                av_log(c->fc, AV_LOG_WARNING,
                       "overread end of atom '%s' by %"PRId64" bytes\n",
                       av_fourcc2str(a.type), -left);
                avio_seek(pb, left, SEEK_CUR);
            }
        }

        total_size += a.size;//one atom fully consumed
    }

    if (total_size < atom.size && atom.size < 0x7ffff)
        avio_skip(pb, atom.size - total_size);

    c->atom_depth --;
    return 0;
}
//the header box of the moov box
static int mov_read_mvhd(MOVContext *c, AVIOContext *pb, MOVAtom atom)
{
    int i;
    int64_t creation_time;
    int version = avio_r8(pb); /* version */
    avio_rb24(pb); /* flags */

    if (version == 1) {
        creation_time = avio_rb64(pb);//creation time
        avio_rb64(pb);
    } else {
        creation_time = avio_rb32(pb);
        avio_rb32(pb); /* modification time */
    }
    mov_metadata_creation_time(&c->fc->metadata, creation_time, c->fc);
    c->time_scale = avio_rb32(pb); /* time scale */
    if (c->time_scale <= 0) {
        av_log(c->fc, AV_LOG_ERROR, "Invalid mvhd time scale %d, defaulting to 1\n", c->time_scale);
        c->time_scale = 1;
    }
    av_log(c->fc, AV_LOG_TRACE, "time scale = %i\n", c->time_scale);

    c->duration = (version == 1) ? avio_rb64(pb) : avio_rb32(pb); /* duration */
    // set the AVFormatContext duration because the duration of individual tracks
    // may be inaccurate
    if (c->time_scale > 0 && !c->trex_data)
        c->fc->duration = av_rescale(c->duration, AV_TIME_BASE, c->time_scale);
    avio_rb32(pb); /* preferred scale */

    avio_rb16(pb); /* preferred volume */

    avio_skip(pb, 10); /* reserved */

    /* movie display matrix, store it in main context and use it later on */
    for (i = 0; i < 3; i++) {
        c->movie_display_matrix[i][0] = avio_rb32(pb); // 16.16 fixed point
        c->movie_display_matrix[i][1] = avio_rb32(pb); // 16.16 fixed point
        c->movie_display_matrix[i][2] = avio_rb32(pb); //  2.30 fixed point
    }

    avio_rb32(pb); /* preview time */
    avio_rb32(pb); /* preview duration */
    avio_rb32(pb); /* poster time */
    avio_rb32(pb); /* selection time */
    avio_rb32(pb); /* selection duration */
    avio_rb32(pb); /* current time */
    avio_rb32(pb); /* next track ID */

    return 0;
}

The most important interface to call out here is mov_read_trak. A trak box stores the metadata for the samples, describing each sample's size, position, dts, and so on, while mdat stores the sample data itself; each sample's position in the file is derived from the trak boxes inside moov.
For details, see the companion article on the trak box (atom) module, covering mov_read_stsd, (stts), (stss), (ctts), (stsc), (stsz), and (stco).

static int mov_read_trak(MOVContext *c, AVIOContext *pb, MOVAtom atom)
{
    AVStream *st;
    MOVStreamContext *sc;
    int ret;

    st = avformat_new_stream(c->fc, NULL);//allocate and initialize an AVStream; typically there are two streams, one video and one audio
    if (!st) return AVERROR(ENOMEM);
    st->id = -1;
    sc = av_mallocz(sizeof(MOVStreamContext));
    if (!sc) return AVERROR(ENOMEM);

    st->priv_data = sc;//priv_data is set so later interfaces can recover sc from the stream
    st->codecpar->codec_type = AVMEDIA_TYPE_DATA;
    sc->ffindex = st->index;
    c->trak_index = st->index;

    //trak is a nested box whose lowest level holds the stsd, stts, stss, ctts, stco, etc. boxes (covered in the companion article), so keep descending here
    if ((ret = mov_read_default(c, pb, atom)) < 0)
        return ret;

    c->trak_index = -1;

    // Here stsc refers to a chunk not described in stco. This is technically invalid,
    // but we can overlook it (clearing stsc) whenever stts_count == 0 (indicating no samples).
    if (!sc->chunk_count && !sc->stts_count && sc->stsc_count) {
        sc->stsc_count = 0;
        av_freep(&sc->stsc_data);
    }

    /* sanity checks */
    if ((sc->chunk_count && (!sc->stts_count || !sc->stsc_count ||
                            (!sc->sample_size && !sc->sample_count))) ||
        (!sc->chunk_count && sc->sample_count)) {
        av_log(c->fc, AV_LOG_ERROR, "stream %d, missing mandatory atoms, broken header\n",
               st->index);
        return 0;
    }
    if (sc->stsc_count && sc->stsc_data[ sc->stsc_count - 1 ].first > sc->chunk_count) {
        av_log(c->fc, AV_LOG_ERROR, "stream %d, contradictionary STSC and STCO\n",
               st->index);
        return AVERROR_INVALIDDATA;
    }

    fix_timescale(c, sc);//sc->time_scale comes from mdhd (with a fallback to the mvhd time scale)

    //easy to overlook: this hands the sc->time_scale read from mdhd up to the stream,
    //setting AVStream *st->time_base; this is where st->time_base comes from
    avpriv_set_pts_info(st, 64, 1, sc->time_scale);

    //extract each sample's information (position, size, dts, pts, ...) and store it in AVIndexEntry
    mov_build_index(c, st);

    if (sc->dref_id-1 < sc->drefs_count && sc->drefs[sc->dref_id-1].path) {
        MOVDref *dref = &sc->drefs[sc->dref_id - 1];
        if (c->enable_drefs) {
            ....
        }
    } else {
        sc->pb = c->fc->pb;
        sc->pb_is_copied = 1;
    }

    if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
        if (!st->sample_aspect_ratio.num && st->codecpar->width && st->codecpar->height &&
            sc->height && sc->width &&
            (st->codecpar->width != sc->width || st->codecpar->height != sc->height)) {
            st->sample_aspect_ratio = av_d2q(((double)st->codecpar->height * sc->width) /
                                             ((double)st->codecpar->width * sc->height), INT_MAX);
        }

#if FF_API_R_FRAME_RATE
        if (sc->stts_count == 1 || (sc->stts_count == 2 && sc->stts_data[1].count == 1))
            av_reduce(&st->r_frame_rate.num, &st->r_frame_rate.den,
                      sc->time_scale, sc->stts_data[0].duration, INT_MAX);
#endif
    }

    // done for ai5q, ai52, ai55, ai1q, ai12 and ai15.
    if (!st->codecpar->extradata_size && st->codecpar->codec_id == AV_CODEC_ID_H264 &&
        TAG_IS_AVCI(st->codecpar->codec_tag)) {
        ret = ff_generate_avci_extradata(st);
        if (ret < 0)
            return ret;
    }

    switch (st->codecpar->codec_id) {
#if CONFIG_H261_DECODER
    case AV_CODEC_ID_H261:
#endif
#if CONFIG_H263_DECODER
    case AV_CODEC_ID_H263:
#endif
#if CONFIG_MPEG4_DECODER
    case AV_CODEC_ID_MPEG4:
#endif
        st->codecpar->width = 0; /* let decoder init width/height */
        st->codecpar->height= 0;
        break;
    }...
    /* Do not need those anymore. */
    av_freep(&sc->chunk_offsets);
    av_freep(&sc->sample_sizes);
    av_freep(&sc->keyframes);
    av_freep(&sc->stts_data);
    av_freep(&sc->stps_data);
    av_freep(&sc->elst_data);
    av_freep(&sc->rap_group);

    return 0;
}

Another key interface:
it extracts each sample's information (position, size, dts, pts, etc.) and stores it in AVIndexEntry.

//the mov demuxer's per-stream context, holding the stream's sample information (sizes, indices, keyframes, ...).
//Typical audio/video files don't use every kind of atom; I'm focusing on movie files here, so the member
//comments cover what parsing a movie MP4 needs, and the rest is generally unused.
//The struct often uses pointer arrays for sequences, e.g. int *keyframes for the keyframe list.
//An "Entry" is a concept from the MP4 spec; think of it as a record, just as MP4 uses the box (atom) as its storage concept.
typedef struct MOVStreamContext {
    AVIOContext *pb;
    int pb_is_copied;
    int ffindex;          ///< AVStream index (typically 0 or 1: audio or video)
    int next_chunk;
    unsigned int chunk_count; //total number of chunks (each chunk holds one or more samples)
    int64_t *chunk_offsets;   //stco: each chunk's absolute offset from the start of the file,
                              //so every chunk can be located without relying on other parameters
    unsigned int stts_count; //number of stts entries (the samples' dts information)
    MOVStts *stts_data;//stts data: the samples' dts deltas
    
    unsigned int ctts_count; //number of ctts entries (the samples' pts/dts offset information)
    //while the ctts atom is being read, ctts_count is the number of ctts entries;
    //after mov_build_index re-fills ctts_data (an entry can cover more than one sample, so the
    //entry count can be less than the sample count), ctts_count becomes the sample count
    unsigned int ctts_allocated_size; //bytes currently allocated for ctts_data
    MOVStts *ctts_data;//ctts data: the samples' pts/dts offset information
    unsigned int stsc_count; //number of stsc entries (samples-per-chunk information; note: not the chunk count, since stsc works with chunk indices)
    MOVStsc *stsc_data;//stsc data: how many samples each chunk holds
    
    unsigned int stsc_index; //index into stsc_data
    int stsc_sample;
    unsigned int stps_count;
    unsigned *stps_data;  ///< partial sync sample for mpeg-2 open gop
    MOVElst *elst_data; //edit list data; determines the first sample's dts

    unsigned int elst_count; //number of elst entries (usually 1)
    int ctts_index;//index into ctts_data
    int ctts_sample;
    unsigned int sample_size; ///< if all samples share one size this holds it, else 0; may contain a value calculated from stsd or the value from the stsz atom
    unsigned int stsz_sample_size; ///< if all samples share one size this holds it, else 0; always contains the sample size from the stsz atom
    unsigned int sample_count;//total number of samples (frames)
    int *sample_sizes; //size of each frame
    int keyframe_absent; //whether keyframe information is absent
    unsigned int keyframe_count; //number of keyframes
    int *keyframes; //keyframe index array (keyframes stored as an array of int)
    int time_scale; //time scale from the mdhd box
    int64_t time_offset;  //dts shift for the stream (its negative is the first sample's dts)
    int64_t min_corrected_pts;  ///< minimum composition time shown by the edits excluding empty edits
    int current_sample; //current sample number
    int64_t current_index;//current index entry number
    MOVIndexRange* index_ranges;
    MOVIndexRange* current_index_range;
    unsigned int bytes_per_frame; //needed for some audio (generally unused for AAC)
    unsigned int samples_per_frame;//needed for some audio (generally unused for AAC)
    int dv_audio_container;
    int pseudo_stream_id; //number of stsd entries, usually 1; -1 means demux all ids
    int16_t audio_cid;    ///< stsd audio compression id
    unsigned drefs_count;
    MOVDref *drefs; //data references, relevant for external/network media
    int dref_id;
    int timecode_track;
    int width;            ///< tkhd width
    int height;           ///< tkhd height
    int dts_shift;        //usually 0; dts shift when ctts is negative
    uint32_t palette[256];
    int has_palette;
    int64_t data_size; //total size of all frames
    uint32_t tmcd_flags;  ///< tmcd track flags
    int64_t track_end;  //end position of the track in time (the total duration)  ///< used for dts generation in fragmented movie files
    int start_pad;        ///< amount of samples to skip due to enc-dec delay
    unsigned int rap_group_count;
    MOVSbgp *rap_group;

    int nb_frames_for_fps; //total number of frames
    int64_t duration_for_fps; //summed duration of all frames

    /** extradata array (and size) for multiple stsd */
    uint8_t **extradata;
    int *extradata_size;
    int last_stsd_index;
    int stsd_count;//number of stsd entries
    int stsd_version;//stsd version

    int32_t *display_matrix;//video display matrix
    AVStereo3D *stereo3d;
    AVSphericalMapping *spherical;
    size_t spherical_size;
    AVMasteringDisplayMetadata *mastering;
    AVContentLightMetadata *coll;
    size_t coll_size;

    uint32_t format; //codec format tag

    int has_sidx;  // If there is an sidx entry for this stream.
    struct {
        struct AVAESCTR* aes_ctr;
        unsigned int per_sample_iv_size;  // Either 0, 8, or 16.
        AVEncryptionInfo *default_encrypted_sample;
        MOVEncryptionIndex *encryption_index;
    } cenc;
} MOVStreamContext;
//after demuxing, each sample's information lands in this struct; the demuxer later uses it (sample size, position, ...) to read the media data and fill an AVPacket
typedef struct AVIndexEntry {
    int64_t pos;              //sample's offset from the start of the file, i.e. its absolute position (chunk offset from stco plus the sizes of preceding samples)
    int64_t timestamp;        //sample's dts (when there is no ctts, pts defaults to dts and timestamp is copied straight into pkt->pts)
                               /** Timestamp in AVStream.time_base units, preferably the time from which on correctly decoded frames are available
                               * when seeking to this entry. That means preferable PTS on keyframe based formats.
                               * But demuxers can choose to store a different timestamp, if it is more convenient for the implementation or nothing better
                               * is known
                               */
#define AVINDEX_KEYFRAME 0x0001
#define AVINDEX_DISCARD_FRAME  0x0002    /**
                                          * Flag is used to indicate which frame should be discarded after decoding.
                                          */
    int flags:2;   //keyframe flag
    int size:30;  //sample size; trying to keep the size of this small to reduce memory requirements (it is 24 vs. 32 bytes due to possible 8-byte alignment)
    int min_distance; /** min_distance==0 means this sample is a keyframe < Minimum distance between this and the previous keyframe, used to avoid unneeded searching. */
} AVIndexEntry;

typedef struct MOVElst {
    int64_t duration;//total duration covered by this edit
    int64_t time;//start time of the media (its negative is the first sample's dts)
    float rate; //playback rate, usually 1
} MOVElst;

typedef struct MOVStts {
    unsigned int count; //number of consecutive samples sharing this duration
    int duration; //each sample's dts delta (the increment)
} MOVStts;

typedef struct MOVStsc {
    int first;//index of the first chunk this run applies to (a chunk holds one or more samples)
    int count;//number of samples in each of those chunks
    int id;   //sample description index, usually 1
} MOVStsc;
   
static void mov_build_index(MOVContext *mov, AVStream *st)
{
    MOVStreamContext *sc = st->priv_data;
    int64_t current_offset; //current sample's offset from the start of the file
    int64_t current_dts = 0; //current sample's dts (the first one may come out negative)
    unsigned int stts_index = 0;
    unsigned int stsc_index = 0;
    unsigned int stss_index = 0;//keyframe index
    unsigned int stps_index = 0;
    unsigned int i, j;
    uint64_t stream_size = 0;//total size of all samples
    MOVStts *ctts_data_old = sc->ctts_data;
    unsigned int ctts_count_old = sc->ctts_count; //number of ctts entries

    if (sc->elst_count) {
        int i, edit_start_index = 0, multiple_edits = 0;
        int64_t empty_duration = 0; // empty duration of the first edit list entry
        int64_t start_time = 0; //start time of the media (the first dts)

        for (i = 0; i < sc->elst_count; i++) {//elst_count is usually 1
            const MOVElst *e = &sc->elst_data[i];
            if (i == 0 && e->time == -1) {
                /* if empty, the first entry is the start time of the stream
                 * relative to the presentation itself */
                empty_duration = e->duration;
                edit_start_index = 1;
            } else if (i == edit_start_index && e->time >= 0) {
                start_time = e->time;//the first dts
            } else {
                multiple_edits = 1;
            }
        }

        if (multiple_edits && !mov->advanced_editlist)
            av_log(mov->fc, AV_LOG_WARNING, "multiple edit list entries, "
                   "Use -advanced_editlist to correctly decode otherwise "
                   "a/v desync might occur\n");

        /* adjust first dts according to edit list */
        if ((empty_duration || start_time) && mov->time_scale > 0) {
            if (empty_duration)
                empty_duration = av_rescale(empty_duration, sc->time_scale, mov->time_scale);
            sc->time_offset = start_time - empty_duration; //starting time shift for the samples
            sc->min_corrected_pts = start_time;
            if (!mov->advanced_editlist)
                current_dts = -sc->time_offset;//so the first dts comes out negative
        }

        if (!multiple_edits && !mov->advanced_editlist &&
            st->codecpar->codec_id == AV_CODEC_ID_AAC && start_time > 0)
            sc->start_pad = start_time;
    }

    /* only use old uncompressed audio chunk demuxing when stts specifies it */
    if (!(st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO &&
          sc->stts_count == 1 && sc->stts_data[0].duration == 1)) {
        unsigned int current_sample = 0; //current sample number
        unsigned int stts_sample = 0;
        unsigned int sample_size; //size of the current sample
        unsigned int distance = 0;
        unsigned int rap_group_index = 0;
        unsigned int rap_group_sample = 0;
        int64_t last_dts = 0; //previous sample's dts
        int64_t dts_correction = 0;
        int rap_group_present = sc->rap_group_count && sc->rap_group;
        int key_off = (sc->keyframe_count && sc->keyframes[0] > 0) || (sc->stps_count && sc->stps_data[0] > 0);

        current_dts -= sc->dts_shift;
        last_dts     = current_dts;//previous sample's dts

        if (!sc->sample_count || st->internal->nb_index_entries)
            return;
        if (sc->sample_count >= UINT_MAX / sizeof(*st->internal->index_entries) - st->internal->nb_index_entries)
            return;
        //grow AVStream *st->internal->index_entries to hold every sample, so each one can later be located and read from the file
        if (av_reallocp_array(&st->internal->index_entries,
                              st->internal->nb_index_entries + sc->sample_count,
                              sizeof(*st->internal->index_entries)) < 0) {
            st->internal->nb_index_entries = 0;
            return;
        }
        st->internal->index_entries_allocated_size = (st->internal->nb_index_entries + sc->sample_count) * sizeof(*st->internal->index_entries);

        if (ctts_data_old) {
            // Expand ctts entries such that we have a 1-1 mapping with samples
            if (sc->sample_count >= UINT_MAX / sizeof(*sc->ctts_data))
                return;
            sc->ctts_count = 0;
            sc->ctts_allocated_size = 0;
            //re-allocate ctts_data
            sc->ctts_data = av_fast_realloc(NULL, &sc->ctts_allocated_size,
                                    sc->sample_count * sizeof(*sc->ctts_data));
            if (!sc->ctts_data) {
                av_free(ctts_data_old);
                return;
            }

            memset((uint8_t*)(sc->ctts_data), 0, sc->ctts_allocated_size);

            //here ctts_count becomes the sample count, because a ctts entry's count can be >= 2,
            //i.e. the number of ctts entries read from the ctts box is <= the sample count;
            //to give every sample a pts offset, the entries are expanded via ctts_data->count so
            //there is one per sample, which makes each sample's pts computable
            for (i = 0; i < ctts_count_old &&
                        sc->ctts_count < sc->sample_count; i++)
                for (j = 0; j < ctts_data_old[i].count &&
                            sc->ctts_count < sc->sample_count; j++)
                    add_ctts_entry(&sc->ctts_data, &sc->ctts_count,
                                   &sc->ctts_allocated_size, 1,
                                   ctts_data_old[i].duration);
            av_free(ctts_data_old);
        }
        //chunk_count is the total number of chunks (each holding some samples); stsc_data plus chunk_count locate every sample
        for (i = 0; i < sc->chunk_count; i++) {
            int64_t next_offset = i+1 < sc->chunk_count ? sc->chunk_offsets[i+1] : INT64_MAX;
            current_offset = sc->chunk_offsets[i];//offset of the i-th chunk from the start of the file
            //check whether the i-th chunk still falls in the range of stsc_data[stsc_index]; if not, advance to the next entry
            while (mov_stsc_index_valid(stsc_index, sc->stsc_count) && 
                i + 1 == sc->stsc_data[stsc_index + 1].first)
                stsc_index++;

           ......
            //the i-th chunk is within stsc_index's range; iterate over the samples in this chunk
            for (j = 0; j < sc->stsc_data[stsc_index].count; j++) {
                int keyframe = 0;
                if (current_sample >= sc->sample_count) {
                    av_log(mov->fc, AV_LOG_ERROR, "wrong sample count\n");
                    return;
                }

                if (!sc->keyframe_absent && (!sc->keyframe_count || current_sample+key_off == sc->keyframes[stss_index])) {
                    keyframe = 1;
                    if (stss_index + 1 < sc->keyframe_count)
                        stss_index++;
                } else if (sc->stps_count && current_sample+key_off == sc->stps_data[stps_index]) {
                    keyframe = 1;
                    if (stps_index + 1 < sc->stps_count)
                        stps_index++;
                }
                ...
                if (sc->keyframe_absent
                    && !sc->stps_count
                    && !rap_group_present
                    && (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO || (i==0 && j==0)))
                     keyframe = 1;
                if (keyframe)
                    distance = 0;
                sample_size = sc->stsz_sample_size > 0 ? sc->stsz_sample_size : sc->sample_sizes[current_sample];
                if (sc->pseudo_stream_id == -1 ||
                   sc->stsc_data[stsc_index].id - 1 == sc->pseudo_stream_id) {
                    AVIndexEntry *e;
                    if (sample_size > 0x3FFFFFFF) {
                        av_log(mov->fc, AV_LOG_ERROR, "Sample size %u is too large\n", sample_size);
                        return;
                    }
                    e = &st->internal->index_entries[st->internal->nb_index_entries++];
                    e->pos = current_offset;//sample's absolute position in the file
                    e->timestamp = current_dts;//sample's dts
                    e->size = sample_size;//sample's size
                    e->min_distance = distance;//min_distance==0 means this sample is a keyframe
                    e->flags = keyframe ? AVINDEX_KEYFRAME : 0;//keyframe flag
                    av_log(mov->fc, AV_LOG_TRACE, "AVIndex stream %d, sample %u, offset %"PRIx64", dts %"PRId64", "
                            "size %u, distance %u, keyframe %d\n", st->index, current_sample,
                            current_offset, current_dts, sample_size, distance, keyframe);
                    if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO && st->internal->nb_index_entries < 100)
                        ff_rfps_add_frame(mov->fc, st, current_dts);
                }

                current_offset += sample_size;// file offset of the next sample
                stream_size += sample_size;// running total of sample bytes

                /* A negative sample duration is invalid based on the spec,
                 * but some samples need it to correct the DTS. */
                if (sc->stts_data[stts_index].duration < 0) {
                    av_log(mov->fc, AV_LOG_WARNING,
                           "Invalid SampleDelta %d in STTS, at %d st:%d\n",
                           sc->stts_data[stts_index].duration, stts_index,
                           st->index);
                    dts_correction += sc->stts_data[stts_index].duration - 1;
                    sc->stts_data[stts_index].duration = 1;
                }
                current_dts += sc->stts_data[stts_index].duration;// dts of the next sample
                if (!dts_correction || current_dts + dts_correction > last_dts) {
                    current_dts += dts_correction;
                    dts_correction = 0;
                } else {
                    /* Avoid creating non-monotonous DTS */
                    dts_correction += current_dts - last_dts - 1;
                    current_dts = last_dts + 1;
                }
                last_dts = current_dts;// remember as the previous sample's dts for the monotonicity check
                distance++;
                stts_sample++;
                current_sample++;
                if (stts_index + 1 < sc->stts_count && stts_sample == sc->stts_data[stts_index].count) {
                    stts_sample = 0;
                    stts_index++;
                }
            }
        }
        if (st->duration > 0)
           // bitrate = total sample bytes * 8 * time_scale / total dts duration (st->duration comes from stts)
            st->codecpar->bit_rate = stream_size*8*sc->time_scale/st->duration;
    }...

    if (!mov->ignore_editlist && mov->advanced_editlist) {
        // Fix index according to edit lists.
        mov_fix_index(mov, st);
    }

    // Update start time of the stream.
    if (st->start_time == AV_NOPTS_VALUE && st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO && st->internal->nb_index_entries > 0) {
        st->start_time = st->internal->index_entries[0].timestamp + sc->dts_shift;
        if (sc->ctts_data) {
            st->start_time += sc->ctts_data[0].duration;
        }
    }

    mov_estimate_video_delay(mov, st);
}

mov_read_packet reads the data of one sample per call: it locates the sample in the file via the sample metadata built from the trak box, computes the sample's pts, and fills an AVPacket with the result; that AVPacket is then handed to the decoder as input.

static int mov_read_packet(AVFormatContext *s, AVPacket *pkt)
{
    MOVContext *mov = s->priv_data;
    MOVStreamContext *sc;
    AVIndexEntry *sample;
    AVStream *st = NULL;
    int64_t current_index;
    int ret;
    mov->fc = s;
 retry:
   //look up the sample's metadata AVIndexEntry (position, size, ...) in AVStream->internal->index_entries by the current_sample index
    sample = mov_find_next_sample(s, &st);
...
    sc = st->priv_data;
    /* must be done just before reading, to avoid infinite loop on sample */
    current_index = sc->current_index;
    //advance the current_sample index (+1)
    mov_current_sample_inc(sc);
...
        if (st->discard == AVDISCARD_NONKEY && !(sample->flags & AVINDEX_KEYFRAME)) {
            av_log(mov->fc, AV_LOG_DEBUG, "Nonkey frame from stream %d discarded due to AVDISCARD_NONKEY\n", sc->ffindex);
            goto retry;
        }

        if (st->codecpar->codec_id == AV_CODEC_ID_EIA_608 && sample->size > 8)
            ret = get_eia608_packet(sc->pb, pkt, sample->size);
        else
        //using the size from the AVIndexEntry, av_get_packet (avio_read underneath) reads sample->size bytes from the file into the AVPacket
            ret = av_get_packet(sc->pb, pkt, sample->size);
        if (ret < 0) {
            if (should_retry(sc->pb, ret)) {
                mov_current_sample_dec(sc);
            }
            return ret;
        }
...
        if (sc->has_palette) {// needed by palettized QuickTime formats
...
    }

    pkt->stream_index = sc->ffindex;// stream index
    pkt->dts = sample->timestamp;// assign the sample's dts to the AVPacket
    if (sample->flags & AVINDEX_DISCARD_FRAME) {
        pkt->flags |= AV_PKT_FLAG_DISCARD;
    }
    
    //compute the sample's pts: pts = dts + dts_shift + ctts_data[i].duration (the composition offset from ctts)
    //without a ctts box, pts == dts
    if (sc->ctts_data && sc->ctts_index < sc->ctts_count) {
        pkt->pts = pkt->dts + sc->dts_shift + sc->ctts_data[sc->ctts_index].duration;
        /* update ctts context */
        sc->ctts_sample++;
        if (sc->ctts_index < sc->ctts_count &&
            sc->ctts_data[sc->ctts_index].count == sc->ctts_sample) {
            sc->ctts_index++;
            sc->ctts_sample = 0;
        }
    } else {
        int64_t next_dts = (sc->current_sample < st->internal->nb_index_entries) ?
            st->internal->index_entries[sc->current_sample].timestamp : st->duration;

        if (next_dts >= pkt->dts)
            pkt->duration = next_dts - pkt->dts;
        pkt->pts = pkt->dts;
    }
    if (st->discard == AVDISCARD_ALL)
        goto retry;
    if (sc->sdtp_data && sc->current_sample <= sc->sdtp_count) {
        uint8_t sample_flags = sc->sdtp_data[sc->current_sample - 1];
        uint8_t sample_is_depended_on = (sample_flags >> 2) & 0x3;
        pkt->flags |= sample_is_depended_on == MOV_SAMPLE_DEPENDENCY_NO ? AV_PKT_FLAG_DISPOSABLE : 0;
    }
    pkt->flags |= sample->flags & AVINDEX_KEYFRAME ? AV_PKT_FLAG_KEY : 0;
    pkt->pos = sample->pos;// assign the sample's file position to the AVPacket

    /* Multiple stsd handling. */
    if (sc->stsc_data) {
        /* Keep track of the stsc index for the given sample, then check
        * if the stsd index is different from the last used one. */
        sc->stsc_sample++;
        if (mov_stsc_index_valid(sc->stsc_index, sc->stsc_count) &&
            mov_get_stsc_samples(sc, sc->stsc_index) == sc->stsc_sample) {
            sc->stsc_index++;
            sc->stsc_sample = 0;
        /* Do not check indexes after a switch. */
        } else if (sc->stsc_data[sc->stsc_index].id > 0 &&
                   sc->stsc_data[sc->stsc_index].id - 1 < sc->stsd_count &&
                   sc->stsc_data[sc->stsc_index].id - 1 != sc->last_stsd_index) {
            ret = mov_change_extradata(sc, pkt);
            if (ret < 0)
                return ret;
        }
    }

    if (mov->aax_mode)
        aax_filter(pkt->data, pkt->size, mov);

    ret = cenc_filter(mov, st, sc, pkt, current_index);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

//avst->internal->index_entries was allocated and filled in mov_build_index above, see that interface
//current_sample advances by 1 on every read; the dts computed here is only used to pick the next stream/sample (the packet's timestamps are taken from the index entry later)
static AVIndexEntry *mov_find_next_sample(AVFormatContext *s, AVStream **st)
{
    AVIndexEntry *sample = NULL;
    int64_t best_dts = INT64_MAX;
    int i;
    for (i = 0; i < s->nb_streams; i++) {
        AVStream *avst = s->streams[i];
        MOVStreamContext *msc = avst->priv_data;
        //pick the stream whose pending sample has the smallest rescaled dts (interleaving)
        if (msc->pb && msc->current_sample < avst->internal->nb_index_entries) {
            AVIndexEntry *current_sample = &avst->internal->index_entries[msc->current_sample];
            int64_t dts = av_rescale(current_sample->timestamp, AV_TIME_BASE, msc->time_scale);
            av_log(s, AV_LOG_TRACE, "stream %d, sample %d, dts %"PRId64"\n", i, msc->current_sample, dts);
            if (!sample || (!(s->pb->seekable & AVIO_SEEKABLE_NORMAL) && current_sample->pos < sample->pos) ||
                ((s->pb->seekable & AVIO_SEEKABLE_NORMAL) &&
                 ((msc->pb != s->pb && dts < best_dts) || (msc->pb == s->pb && dts != AV_NOPTS_VALUE &&
                 ((FFABS(best_dts - dts) <= AV_TIME_BASE && current_sample->pos < sample->pos) ||
                  (FFABS(best_dts - dts) > AV_TIME_BASE && dts < best_dts)))))) {
                sample = current_sample;
                best_dts = dts;
                *st = avst;
            }
        }
    }
    return sample;
}

That covers the important interfaces and data structures of mov_read_header, mov_read_packet and mov_read_trak in detail; one careful read-through should make the flow clear (writing it all up was a lot of work!). For the most important part, the trak box, see the chapter ffmpeg解封装mov/mp4格式解封装源码分析之trak box(atom)模块mov_read_stsd(stts)、(stss)、(ctts)、(stsc)、(stsz)、(stco)接口分析.
