视频的显示和存放原理
对于一个电影,帧是这样来显示的:I B B P。现在我们需要在显示B帧之前知道P帧中的信息。因此,帧可能会按照这样的方式来存储:IPBB。这就是为什么我们会有一个解码时间戳和一个显示时间戳的原因。解码时间戳告诉我们什么时候需要解码,显示时间戳告诉我们什么时候需要显示。所以,在这种情况下,我们的流可以是这样的:
PTS: 1 4 2 3
DTS: 1 2 3 4
Stream: I P B B
通常PTS和DTS只有在流中有B帧的时候会不同。
ffmpeg中的时间单位
AV_TIME_BASE
ffmpeg中的内部计时单位(时间基),ffmepg中的所有时间都是于它为一个单位,比如AVStream中的duration即以为着这个流的长度为duration个AV_TIME_BASE。AV_TIME_BASE定义为:
#define AV_TIME_BASE 1000000
AV_TIME_BASE_Q
ffmpeg内部时间基的分数表示,实际上它是AV_TIME_BASE的倒数。从它的定义能很清楚的看到这点:
#define AV_TIME_BASE_Q (AVRational){1, AV_TIME_BASE}
AVRatioal的定义如下:
typedef struct AVRational{
int num; //numerator
int den; //denominator
} AVRational;
ffmpeg提供了一个把AVRatioal结构转换成double的函数:
static inline double av_q2d(AVRational a){
/**
* Convert rational to double.
* @param a rational to convert
**/
return a.num / (double) a.den;
}
现在可以根据pts来计算一桢在整个视频中的时间位置:
timestamp(秒) = pts * av_q2d(st->time_base)
计算视频长度的方法:
time(秒) = st->duration * av_q2d(st->time_base)
这里的st是一个AVStream对象指针。
ffmpeg 中的时间基
ffmpeg存在多个时间基准(time_base),对应不同的阶段(结构体),每个time_base具体的值不一样,ffmpeg提供函数在各个time_base中进行切换。搞清楚各个time_base的来源,对于阅读ffmpeg的代码很重要。
AVStream(libavformat/avformat.h)
typedef struct AVStream {
/**
* This is the fundamental unit of time (in seconds) in terms
* of which frame timestamps are represented.
*
* decoding: set by libavformat
* encoding: May be set by the caller before avformat_write_header() to
* provide a hint to the muxer about the desired timebase. In
* avformat_write_header(), the muxer will overwrite this field
* with the timebase that will actually be used for the timestamps
* written into the file (which may or may not be related to the
* user-provided one, depending on the format).
*/
AVRational time_base;
/**
* Decoding: pts of the first frame of the stream in presentation order, in stream time base.
* Only set this if you are absolutely 100% sure that the value you set
* it to really is the pts of the first frame.
* This may be undefined (AV_NOPTS_VALUE).
* @note The ASF header does NOT contain a correct start_time the ASF
* demuxer must NOT set this.
*/
int64_t start_time;
/**
* Decoding: duration of the stream, in stream time base.
* If a source file does not specify a duration, but does specify
* a bitrate, this value will be estimated from bitrate and file size.
*/
int64_t duration;
从上面的信息可以看到,AVStream->time_base单位为秒。 那AVStream->time_base具体的值是多少呢?下面以mpegts_demuxer为例:
static int mpegts_set_stream_info(AVStream *st, PESContext *pes,
uint32_t stream_type, uint32_t prog_reg_desc)
{
avpriv_set_pts_info(st, 33, 1, 90000);
void avpriv_set_pts_info(AVStream *s, int pts_wrap_bits,
unsigned int pts_num, unsigned int pts_den)
{
AVRational new_tb;
if (av_reduce(&new_tb.num, &new_tb.den, pts_num, pts_den, INT_MAX)) {
if (new_tb.num != pts_num)
av_log(NULL, AV_LOG_DEBUG,
"st:%d removing common factor %d from timebase\n",
s->index, pts_num / new_tb.num);
} else
av_log(NULL, AV_LOG_WARNING,
"st:%d has too large timebase, reducing\n", s->index);
if (new_tb.num <= 0 || new_tb.den <= 0) {
av_log(NULL, AV_LOG_ERROR,
"Ignoring attempt to set invalid timebase %d/%d for st:%d\n",
new_tb.num, new_tb.den,
s->index);
return;
}
s->time_base = new_tb;
av_codec_set_pkt_timebase(s->codec, new_tb);
s->pts_wrap_bits = pts_wrap_bits;
}
通过avpriv_set_pts_info(st, 33, 1, 90000)函数,设置AVStream->time_base为1/90000。为什么是90000?因为mpeg的pts、dts都是以90kHz来采样的,所以采样间隔为1/90000秒。
AVCodecContext
typedef struct AVCodecContext {
/**
* This is the fundamental unit of time (in seconds) in terms
* of which frame timestamps are represented. For fixed-fps content,
* timebase should be 1/framerate and timestamp increments should be
* identically 1.
* - encoding: MUST be set by user.
* - decoding: Set by libavcodec.
*/
AVRational time_base;
从上面的信息可以看到,AVCodecContext->time_base单位同样为秒,不过精度没有AVStream->time_base高,大小为1/framerate。 下面以ffmpeg转码工具为例:
static int transcode_init(void)
{
if (enc_ctx->codec_type == AVMEDIA_TYPE_VIDEO) {
if (ost->filter && !ost->frame_rate.num)
ost->frame_rate = av_buffersink_get_frame_rate(ost->filter->filter);
if (ist && !ost->frame_rate.num)
ost->frame_rate = ist->framerate;
if (ist && !ost->frame_rate.num)
ost->frame_rate = ist->st->r_frame_rate;
if (ist && !ost->frame_rate.num) {
ost->frame_rate = (AVRational){25, 1};
av_log(NULL, AV_LOG_WARNING,
"No information "
"about the input framerate is available. Falling "
"back to a default value of 25fps for output stream #%d:%d. Use the -r option "
"if you want a different framerate.\n",
ost->file_index, ost->index);
}
// ost->frame_rate = ist->st->avg_frame_rate.num ? ist->st->avg_frame_rate : (AVRational){25, 1};
if (ost->enc && ost->enc->supported_framerates && !ost->force_fps) {
int idx = av_find_nearest_q_idx(ost->frame_rate, ost->enc->supported_framerates);
ost->frame_rate = ost->enc->supported_framerates[idx];
}
if (enc_ctx->codec_id == AV_CODEC_ID_MPEG4) {
av_reduce(&ost->frame_rate.num, &ost->frame_rate.den,
ost->frame_rate.num, ost->frame_rate.den, 65535);
}
}
switch (enc_ctx->codec_type) {
case AVMEDIA_TYPE_VIDEO:
enc_ctx->time_base = av_inv_q(ost->frame_rate);
if (ost->filter && !(enc_ctx->time_base.num && enc_ctx->time_base.den))
enc_ctx->time_base = ost->filter->filter->inputs[0]->time_base;
if ( av_q2d(enc_ctx->time_base) < 0.001 && video_sync_method != VSYNC_PASSTHROUGH
&& (video_sync_method == VSYNC_CFR || video_sync_method == VSYNC_VSCFR || (video_sync_method == VSYNC_AUTO && !(oc->oformat->flags & AVFMT_VARIABLE_FPS)))){
av_log(oc, AV_LOG_WARNING, "Frame rate very high for a muxer not efficiently supporting it.\n"
"Please consider specifying a lower framerate, a different muxer or -vsync 2\n");
}
首先获取ost->frame_rate,然后计算enc_ctx->time_base = 1/ost->frame_rate。
总结:AVStream->time_base比AVCodecContext->time_base精度要高(数值要小),比如AVStream->time_base为1/90000,而AVCodecContext->time_base为1/30(假设frame_rate为30);同样的pts和dts,以AVStream->time_base为单位,数值要比以AVCodecContext->time_base为单位要大。
DTS和PTS
音频和视频流都有一些关于以多快速度和什么时间来播放它们的信息在里面。音频流有采样,视频流有每秒的帧率。然而,如果我们只是简单的通过数帧和乘以帧率的方式来同步视频,那么就很有可能会失去同步。于是作为一种补充,在流中的包有种叫做DTS(解码时间戳)和PTS(显示时间戳)的机制。为了这两个参数,你需要了解电影存放的方式。像MPEG等格式,使用被叫做B帧(B表示双向bidrectional)的方式。另外两种帧被叫做I帧和P帧(I表示关键帧,P表示预测帧)。I帧包含了某个特定的完整图像。P帧依赖于前面的I帧和P帧并且使用比较或者差分的方式来编码。B帧与P帧有点类似,但是它是依赖于前面和后面的帧的信息的。这也就解释了为什么我们可能在调用avcodec_decode_video以后会得不到一帧图像。
各个结构下,pts和dts使用哪个time_base来表示呢?
AVPacket
typedef struct AVPacket {
/**
* Presentation timestamp in AVStream->time_base units; the time at which
* the decompressed packet will be presented to the user.
* Can be AV_NOPTS_VALUE if it is not stored in the file.
* pts MUST be larger or equal to dts as presentation cannot happen before
* decompression, unless one wants to view hex dumps. Some formats misuse
* the terms dts and pts/cts to mean something different. Such timestamps
* must be converted to true pts/dts before they are stored in AVPacket.
*/
int64_t pts;
/**
* Decompression timestamp in AVStream->time_base units; the time at which
* the packet is decompressed.
* Can be AV_NOPTS_VALUE if it is not stored in the file.
*/
int64_t dts;
从上面可以看到,AVPacket下的pts和dts以AVStream->time_base为单位(数值比较大)。这也很容易理解,根据mpeg的协议,压缩后或解压前的数据,pts和dts是90kHz时钟的采样值,时间间隔就是AVStream->time_base。
AVFrame
typedef struct AVFrame {
/**
* Presentation timestamp in time_base units (time when frame should be shown to user).
*/
int64_t pts;
/**
* PTS copied from the AVPacket that was decoded to produce this frame.
*/
int64_t pkt_pts;
/**
* DTS copied from the AVPacket that triggered returning this frame. (if frame threading isn't used)
* This is also the Presentation time of this AVFrame calculated from
* only AVPacket.dts values without pts values.
*/
int64_t pkt_dts;
AVFrame里面的pkt_pts和pkt_dts是拷贝自AVPacket,同样以AVStream->time_base为单位;而pts是为输出(显示)准备的,以AVCodecContex->time_base为单位)。//FIXME
InputStream
typedef struct InputStream {
int file_index;
AVStream *st;
AVCodecContext *dec_ctx;
int64_t start; /* time when read started */
/* predicted dts of the next packet read for this stream or (when there are
* several frames in a packet) of the next frame in current packet (in AV_TIME_BASE units) */
int64_t next_dts;
int64_t dts; ///< dts of the last packet read for this stream (in AV_TIME_BASE units)
int64_t next_pts; ///< synthetic pts for the next decode frame (in AV_TIME_BASE units)
int64_t pts; ///< current pts of the decoded frame (in AV_TIME_BASE units)
从上面可以看到,InputStream下的pts和dts以AV_TIME_BASE为单位(微秒),至于为什么要转化为微妙,可能是为了避免使用浮点数。
OutputStream
typedef struct OutputStream {
int file_index; /* file index */
int index; /* stream index in the output file */
int source_index; /* InputStream index */
AVStream *st; /* stream in the output file */
int encoding_needed; /* true if encoding needed for this stream */
int frame_number;
/* input pts and corresponding output pts
for A/V sync */
struct InputStream *sync_ist; /* input stream to sync against */
int64_t sync_opts; /* output frame counter, could be changed to some true timestamp */ // FIXME look at frame_number
/* pts of the first frame encoded for this stream, used for limiting
* recording time */
int64_t first_pts;
/* dts of the last packet sent to the muxer */
int64_t last_mux_dts;
AVBitStreamFilterContext *bitstream_filters;
AVCodecContext *enc_ctx;
AVCodec *enc;
int64_t max_frames;
AVFrame *filtered_frame;
OutputStream涉及音视频同步,结构和InputStream不同,暂时只作记录,不分析。
时间基转换公式
- timestamp(ffmpeg内部时间戳) = AV_TIME_BASE * time(秒)
- time(秒) = AV_TIME_BASE_Q * timestamp(ffmpeg内部时间戳)
所以当需要把视频跳转到N秒的时候可以使用下面的方法:
int64_t timestamp = N * AV_TIME_BASE;
av_seek_frame(fmtctx, index_of_video, timestamp, AVSEEK_FLAG_BACKWARD);
ffmpeg同样为我们提供了不同时间基之间的转换函数:
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)
这个函数的作用是计算a * bq / cq,来把时间戳从一个时基调整到另外一个时基。在进行时基转换的时候,我们应该首选这个函数,因为它可以避免溢出的情况发生。
example
InputStream(AV_TIME_BASE)到AVPacket(AVStream->time_base)
static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output)
{
pkt->dts = av_rescale_q(ist->dts, AV_TIME_BASE_Q, ist->st->time_base);
AVPacket(AVStream->time_base)到InputStream(AV_TIME_BASE)
static int process_input_packet(InputStream *ist, const AVPacket *pkt)
{
if (pkt->dts != AV_NOPTS_VALUE) {
ist->next_dts = ist->dts = av_rescale_q(pkt->dts, ist->st->time_base, AV_TIME_BASE_Q);
TBR TBN TBC
TBR: tbr is guessed from the video stream and is the value users want to see when they look for the video frame rate,代表帧率
TBN:the time base in AVStream that has come from the Container,代表文件层(st)的时间精度,即1S=1200k,和duration相关
TBC: the time base in AVCodecContext for the codec used for a particular stream,代表视频层(st->codec)的时间精度,即1S=50,和strem->duration和时间戳相关。