系列文章目录
【x264编码器】章节1——x264编码流程及基于x264的编码器demo
【x264编码器】章节2——x264的lookahead流程分析
【x265编码器】章节2——编码流程及基于x265的编码器demo
目录
2.P帧16x16帧间预测mb_analyse_inter_p16x16
3.预测16x16宏块的运动矢量x264_mb_predict_mv_16x16
4.获取预测运动矢量x264_mb_predict_mv_ref16x16
一、帧间预测流程
x264帧间预测的总体流程如下:
对P帧而言,每个16x16宏块支持以下四种分割方式:
1.一个16x16分区,对应mb_analyse_inter_p16x16;
2.两个16x8分区,对应mb_analyse_inter_p16x8;
3.两个8x16分区,对应mb_analyse_inter_p8x16;
4.四个8x8子宏块,对应mb_analyse_inter_p8x8;
并且每个8x8块也可以支持4种分割方式:
1.一个8x8块,对应mb_analyse_inter_p8x8;
2.两个8x4块,对应mb_analyse_inter_p8x4;
3.两个4x8块,对应mb_analyse_inter_p4x8;
4.四个4x4块,对应mb_analyse_inter_p4x4;
对于B帧而言,每个16x16宏块也有4种分割方式:
1.一个16x16分区,对应mb_analyse_inter_b16x16;
2.两个16x8分区,对应mb_analyse_inter_b16x8;
3.两个8x16分区,对应mb_analyse_inter_b8x16;
4.四个8x8子宏块,对应mb_analyse_inter_b8x8;
二、代码模块分析
1.宏块分析x264_macroblock_analyse
代码如下:
/*****************************************************************************
* x264_macroblock_analyse:
*   Decides how the current macroblock of `h` is coded: it sets h->mb.i_type
*   and h->mb.i_partition (and, for 8x8 modes, h->mb.i_sub_partition).
*   The slice type in h->sh.i_type selects one of three branches (I, P, B);
*   the tail of the function is shared by every path that does not return
*   early.
*****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
x264_mb_analysis_t analysis;
int i_cost = COST_MAX;
h->mb.i_qp = x264_ratecontrol_mb_qp( h );
/* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
* to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
// Applied only when i_aq_mode is nonzero and i_subpel_refine < 10; the actual test is |i_qp - i_last_qp| == 1.
if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
if( h->param.analyse.b_mb_info )
h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
mb_analyse_init( h, &analysis, h->mb.i_qp );// initialise the analysis state with the QP chosen above
/*--------------------------- Do the analysis ---------------------------*/
if( h->sh.i_type == SLICE_TYPE_I )// I slice
{
intra_analysis:// also entered by goto from the P-slice branch when intra is forced
if( analysis.i_mbrd )// with i_mbrd != 0, call mb_init_fenc_cache first (second argument is i_mbrd >= 2)
mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
mb_analyse_intra( h, &analysis, COST_MAX );// intra analysis, COST_MAX passed as the third argument
if( analysis.i_mbrd )// with i_mbrd != 0, additionally call intra_rd
intra_rd( h, &analysis, COST_MAX );
// Pick the cheapest of I_16x16 / I_4x4 / I_8x8 by comparing the i_satd_* costs; I_PCM wins if its cost is lower still.
i_cost = analysis.i_satd_i16x16;
h->mb.i_type = I_16x16;
COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
if( analysis.i_satd_pcm < i_cost )
h->mb.i_type = I_PCM;
// PCM not chosen: call intra_rd_refine when i_mbrd >= 2
else if( analysis.i_mbrd >= 2 )
intra_rd_refine( h, &analysis );
}
else if( h->sh.i_type == SLICE_TYPE_P )
{ // b_skip becomes 1 when this macroblock is to be coded as P_SKIP
int b_skip = 0;
// first of two prefetch_ref calls (last argument 0 here, 1 further down)
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
analysis.b_try_skip = 0;
if( analysis.b_force_intra )// intra is forced for this macroblock
{ // with b_psy off, go straight to intra analysis; with b_psy on, fall through to the inter path (see the "Intra masking" step below)
if( !h->param.analyse.b_psy )
{ // re-initialise the analysis QP to max(i_qp - ip_offset, i_qp_min), then jump to intra_analysis
mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
goto intra_analysis;
}
}
else// intra not forced: try the skip shortcuts
{
/* Special fast-skip logic using information from mb_info. */
if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
{ // Shortcut taken when: not MBAFF, fdec's i_frame is exactly 1 more than the first list-0 reference's, no weighted prediction, and the reference's effective QP at this MB is <= the current QP.
if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
{
h->mb.i_partition = D_16x16;
/* Use the P-SKIP MV if we can... */
if( !M32(h->mb.cache.pskip_mv) )
{
b_skip = 1;
h->mb.i_type = P_SKIP;
}
/* Otherwise, just force a 16x16 block. */
else
{
h->mb.i_type = P_L0;
analysis.l0.me16x16.i_ref = 0;
M32( analysis.l0.me16x16.mv ) = 0;
}
goto skip_analysis;
}
// shortcut not taken: if b_mb_info_update is set, clear this macroblock's X264_MBINFO_CONSTANT flag
/* Reset the information accordingly */
else if( h->param.analyse.b_mb_info_update )
h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
}
// skip_invalid: more than one frame thread and the P-skip MV's y component exceeds mv_max_spel[1]; the skip shortcuts below are disabled when it is set
int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
/* If the current macroblock is off the frame, just skip it. */
if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
b_skip = 1;
/* Fast P_SKIP detection (only when b_fast_pskip is enabled) */
else if( h->param.analyse.b_fast_pskip )
{
if( skip_invalid )
{}// nothing to do: neither b_skip nor b_try_skip is set
else if( h->param.analyse.i_subpel_refine >= 3 )
analysis.b_try_skip = 1;
else if( h->mb.i_mb_type_left[0] == P_SKIP ||
h->mb.i_mb_type_top == P_SKIP ||
h->mb.i_mb_type_topleft == P_SKIP ||
h->mb.i_mb_type_topright == P_SKIP )
b_skip = x264_macroblock_probe_pskip( h );// probed only when the left, top, top-left or top-right neighbour is P_SKIP
}
}
// second prefetch_ref call (last argument 1)
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
if( b_skip )
{ // skip: P_SKIP with a 16x16 partition
h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16;
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
skip_analysis:
/* Set up MVs for future predictors */
for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
}
else
{ // not skipped: full inter analysis
const unsigned int flags = h->param.analyse.inter;
int i_type;
int i_partition;
int i_satd_inter, i_satd_intra;
// load the cost data into the analysis state
mb_analyse_load_costs( h, &analysis );
// 16x16 inter analysis
mb_analyse_inter_p16x16( h, &analysis );
// if the type is P_SKIP after the 16x16 analysis, zero the stored MVs for references 1.. and return
if( h->mb.i_type == P_SKIP )
{
for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
return;
}
// 8x8 analysis runs only when X264_ANALYSE_PSUB16x16 is enabled; b_mixed_references selects the variant
if( flags & X264_ANALYSE_PSUB16x16 )
{
if( h->param.analyse.b_mixed_references )
mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
else
mb_analyse_inter_p8x8( h, &analysis );
}
// start from 16x16 as the best mode so far
/* Select best inter mode */
i_type = P_L0;
i_partition = D_16x16;
i_cost = analysis.l0.me16x16.cost;
// switch to P_8x8 when 8x8 analysis is enabled and either early termination is off or the 8x8 cost is below the 16x16 cost
if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
{
i_type = P_8x8;
i_partition = D_8x8;
i_cost = analysis.l0.i_cost8x8;
/* Do sub 8x8 (only when X264_ANALYSE_PSUB8x8 is enabled) */
if( flags & X264_ANALYSE_PSUB8x8 )
{
for( int i = 0; i < 4; i++ )// one iteration per 8x8 block
{ // analyse 4x4 first; i_thresh8x4 is the sum of cost_mv of 4x4 blocks 1 and 2 inside this 8x8 block
mb_analyse_inter_p4x4( h, &analysis, i );
int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
{ // taken when early termination is off or the 4x4 cost is below the 8x8 cost plus the threshold
int i_cost8x8 = analysis.l0.i_cost4x4[i];
h->mb.i_sub_partition[i] = D_L0_4x4;
// also try 8x4 and 4x8; keep the cheapest as this block's sub-partition and fold the cost difference into i_cost
mb_analyse_inter_p8x4( h, &analysis, i );
COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
h->mb.i_sub_partition[i], D_L0_8x4 );
mb_analyse_inter_p4x8( h, &analysis, i );
COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
h->mb.i_sub_partition[i], D_L0_4x8 );
i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
}
// called for every 8x8 block, whether or not the branch above was taken
mb_cache_mv_p8x8( h, &analysis, i );
}
analysis.l0.i_cost8x8 = i_cost;
}
}
/* Now do 16x8/8x16 */
int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;// sum of cost_mv of 8x8 blocks 1 and 2
// tried when 8x8 analysis is enabled and (early termination is off, or the 8x8 cost is below the 16x16 cost plus the threshold)
if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
{
int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
+ analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
// 16x8 analysis; take it if cheaper
mb_analyse_inter_p16x8( h, &analysis, i_cost );
COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
// estimate for the second 8x16 partition: SATD of 8x8 blocks 1 and 3 plus half (rounded up) of their summed mv and ref costs
i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
+ analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
// 8x16 analysis; take it if cheaper
mb_analyse_inter_p8x16( h, &analysis, i_cost );
COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
}
// record the partition chosen so far
h->mb.i_partition = i_partition;
/* refine qpel */
//FIXME mb_type costs?
if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{// no qpel refinement here when i_mbrd is set or i_subpel_refine is 0
/* refine later */
}
else if( i_partition == D_16x16 )
{ // 16x16: refine the single ME and take its cost
x264_me_refine_qpel( h, &analysis.l0.me16x16 );
i_cost = analysis.l0.me16x16.cost;
}
else if( i_partition == D_16x8 )
{ // 16x8: refine both partitions; i_cost is the sum of their costs
x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
}
else if( i_partition == D_8x16 )
{ // 8x16: refine both partitions; i_cost is the sum of their costs
x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
}
else if( i_partition == D_8x8 )
{ // 8x8: for each 8x8 block refine the MEs of its sub-partition type and accumulate their costs into i_cost
i_cost = 0;
for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
switch( h->mb.i_sub_partition[i8x8] )
{
case D_L0_8x8:
x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
i_cost += analysis.l0.me8x8[i8x8].cost;
break;
case D_L0_8x4:
x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
i_cost += analysis.l0.me8x4[i8x8][0].cost +
analysis.l0.me8x4[i8x8][1].cost;
break;
case D_L0_4x8:
x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
i_cost += analysis.l0.me4x8[i8x8][0].cost +
analysis.l0.me4x8[i8x8][1].cost;
break;
case D_L0_4x4:
x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
i_cost += analysis.l0.me4x4[i8x8][0].cost +
analysis.l0.me4x4[i8x8][1].cost +
analysis.l0.me4x4[i8x8][2].cost +
analysis.l0.me4x4[i8x8][3].cost;
break;
default:
x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
break;
}
}
}
if( h->mb.b_chroma_me )// b_chroma_me set: the chroma cost is included in the intra costs
{
if( CHROMA444 )
{
mb_analyse_intra( h, &analysis, i_cost );
mb_analyse_intra_chroma( h, &analysis );
}
else
{ // non-4:4:4: chroma intra analysis first, then mb_analyse_intra with i_satd_chroma subtracted from the cost passed in
mb_analyse_intra_chroma( h, &analysis );
mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
}
// add i_satd_chroma to all three intra costs
analysis.i_satd_i16x16 += analysis.i_satd_chroma;
analysis.i_satd_i8x8 += analysis.i_satd_chroma;
analysis.i_satd_i4x4 += analysis.i_satd_chroma;
}
else// b_chroma_me not set: mb_analyse_intra only
mb_analyse_intra( h, &analysis, i_cost );
// i_satd_intra is the minimum of the three intra costs
i_satd_inter = i_cost;
i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
analysis.i_satd_i8x8,
analysis.i_satd_i4x4 );
if( analysis.i_mbrd )
{ // mb_analyse_p_rd receives min(i_satd_inter, i_satd_intra) as its third argument
mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
i_type = P_L0;
i_partition = D_16x16;
i_cost = analysis.l0.i_rd16x16;// start from the 16x16 RD cost, then take 16x8 / 8x16 / 8x8 if their cost is lower
COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
h->mb.i_type = i_type;
h->mb.i_partition = i_partition;
if( i_cost < COST_MAX )// i_satd_inter and i_cost are passed by pointer, so the call may update them
mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );// third argument is i_satd_inter * 5/4 + 1
}
// take an intra type (or I_PCM) instead if its cost is below the best inter cost
COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
h->mb.i_type = i_type;
// intra is forced but an inter type won
if( analysis.b_force_intra && !IS_INTRA(i_type) )
{
/* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
* it was an inter block. */
analyse_update_cache( h, &analysis );
x264_macroblock_encode( h );// encode with the inter mode chosen above
for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )// copy 16x16 fdec planes into fenc (3 planes for 4:4:4, otherwise 1)
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
if( !CHROMA444 )
{ // non-4:4:4: also copy planes 1 and 2, with height 16 >> CHROMA_V_SHIFT
int height = 16 >> CHROMA_V_SHIFT;
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
}
// re-initialise the analysis QP to max(i_qp - ip_offset, i_qp_min)
mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
goto intra_analysis;// redo the analysis as intra
}
// RD refinement: only for i_mbrd >= 2 and types other than I_PCM
if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
{
if( IS_INTRA( h->mb.i_type ) )
{ // intra type: intra_rd_refine
intra_rd_refine( h, &analysis );
}
else if( i_partition == D_16x16 )
{ // 16x16: cache the reference, seed the ME cost with i_cost, then RD qpel refinement
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
analysis.l0.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
}
else if( i_partition == D_16x8 )
{ // 16x8: set all four sub-partition entries to D_L0_8x8, cache both references, refine both 16x8 partitions
M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
}
else if( i_partition == D_8x16 )
{ // 8x16: same steps for the two 8x16 partitions
M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
}
else if( i_partition == D_8x8 )
{ // 8x8: update the cache first, then refine every ME of each 8x8 block according to its sub-partition type
analyse_update_cache( h, &analysis );
for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
{
x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
{
x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
{
x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
{
x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
}
}
}
}
}
}
else if( h->sh.i_type == SLICE_TYPE_B )
{
int i_bskip_cost = COST_MAX;
int b_skip = 0;
if( analysis.i_mbrd )// with i_mbrd != 0, call mb_init_fenc_cache (second argument is i_mbrd >= 2)
mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
h->mb.i_type = B_SKIP;
if( h->mb.b_direct_auto_write )
{
/* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
for( int i = 0; i < 2; i++ )
{
int b_changed = 1;
h->sh.b_direct_spatial_mv_pred ^= 1;
analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
if( analysis.b_direct_available )
{
if( b_changed )
{
x264_mb_mc( h );
b_skip = x264_macroblock_probe_bskip( h );
}
h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
}
else
b_skip = 0;
}
}
else// no direct=auto: a single direct 16x16 MV prediction, no score bookkeeping
analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
analysis.b_try_skip = 0;
if( analysis.b_direct_available )
{ // without b_direct_auto_write, x264_mb_mc has not been called yet on this path, so call it here
if( !h->mb.b_direct_auto_write )
x264_mb_mc( h );
/* If the current macroblock is off the frame, just skip it. */
if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
b_skip = 1;
else if( analysis.i_mbrd )
{ // RD path: skip when ssd_mb() is at most (6 * i_lambda2 + 128) >> 8; the result is also stored in h->mb.b_skip_mc
i_bskip_cost = ssd_mb( h );
/* 6 = minimum cavlc cost of a non-skipped MB */
b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
}
else if( !h->mb.b_direct_auto_write )
{ // non-RD path: probe for B-skip
/* Conditioning the probe on neighboring block types
* doesn't seem to help speed or quality. */
analysis.b_try_skip = x264_macroblock_probe_bskip( h );
if( h->param.analyse.i_subpel_refine < 3 )// below subpel_refine 3 the probe result decides the skip immediately
b_skip = analysis.b_try_skip;
}
/* Set up MVs for future predictors */
if( b_skip )
{ // zero the stored MV for every reference of list 0 and of list 1
for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
}
}
if( !b_skip )
{
const unsigned int flags = h->param.analyse.inter;// the `inter` analysis flags
int i_type;
int i_partition;
int i_satd_inter;
h->mb.b_skip_mc = 0;
h->mb.i_type = B_DIRECT;
// load the cost data into the analysis state
mb_analyse_load_costs( h, &analysis );
/* select best inter mode */
/* direct must be first */
if( analysis.b_direct_available )
mb_analyse_inter_direct( h, &analysis );
// 16x16 inter analysis
mb_analyse_inter_b16x16( h, &analysis );
// if the type is B_SKIP after the 16x16 analysis, zero the stored MVs for references 1.. of both lists and return
if( h->mb.i_type == B_SKIP )
{
for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;// list 0, references 1..
for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;// list 1, references 1..
return;
}
// pick the cheapest 16x16 candidate: L0, L1, bi, direct
i_type = B_L0_L0;
i_partition = D_16x16;
i_cost = analysis.l0.me16x16.cost;
COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
{ // RD enabled, early termination on, and the direct cost is at most 33/32 of the best cost so far
mb_analyse_b_rd( h, &analysis, i_cost );
if( i_bskip_cost < analysis.i_rd16x16direct &&
i_bskip_cost < analysis.i_rd16x16bi &&
i_bskip_cost < analysis.l0.i_rd16x16 &&
i_bskip_cost < analysis.l1.i_rd16x16 )
{ // the skip cost beats the RD cost of direct, bi, L0 and L1 16x16: code as B_SKIP and finish
h->mb.i_type = B_SKIP;
analyse_update_cache( h, &analysis );
return;
}
}
if( flags & X264_ANALYSE_BSUB16x16 )
{ // b_mixed_references selects the 8x8 analysis variant
if( h->param.analyse.b_mixed_references )
mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
else
mb_analyse_inter_b8x8( h, &analysis );
// take B_8x8 if its cost is lower
COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
/* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
int i_mb_type, i_partition16x8[2], i_partition8x16[2];
for( int i = 0; i < 2; i++ )
{ // i selects the partition: for L0, L1 and bi, estimate = summed 8x8 SATD of the two covered blocks + half (rounded up) of their summed mv and ref costs; keep the cheapest
int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
// 16x8
i_best_cost = COST_MAX;
i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
+ analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
+ analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
analysis.i_cost_est16x8[i] = i_best_cost;
// 8x16
i_best_cost = COST_MAX;
i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
+ analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
+ analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
analysis.i_cost_est8x16[i] = i_best_cost;
}
// derive an mb type from the two per-partition choices and add its lambda-weighted table cost to the second partition's estimate; the totals are the sums of both partitions
i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
// each shape is analysed only when early termination is off or its estimate is below the current best cost
/* We can gain a little speed by checking the mode with the lowest estimated cost first */
int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
{
mb_analyse_inter_b16x8( h, &analysis, i_cost );
COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
}
if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
{
mb_analyse_inter_b8x16( h, &analysis, i_cost );
COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
}
if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
{ // 16x8 after 8x16, for the case where 16x8 was not tried first
mb_analyse_inter_b16x8( h, &analysis, i_cost );
COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
}
}
if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{ // no qpel refinement here when i_mbrd is set or i_subpel_refine is 0
/* refine later */
}
/* refine qpel */
else if( i_partition == D_16x16 )
{ // 16x16: subtract the lambda-weighted mb-type cost from the L0 and L1 ME costs before refining
analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
if( i_type == B_L0_L0 )
{ // refine, then i_cost = refined ME cost + the mb-type cost
x264_me_refine_qpel( h, &analysis.l0.me16x16 );
i_cost = analysis.l0.me16x16.cost
+ analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
}
else if( i_type == B_L1_L1 )
{
x264_me_refine_qpel( h, &analysis.l1.me16x16 );
i_cost = analysis.l1.me16x16.cost
+ analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
}
else if( i_type == B_BI_BI )
{ // bi: both bi16x16 MEs are refined; i_cost is not updated here
x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
}
}
else if( i_partition == D_16x8 )
{ // 16x8: for each of the two partitions
for( int i = 0; i < 2; i++ )
{ // refine the L0 ME unless the partition type is D_L1_8x8
if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )// refine the L1 ME unless the partition type is D_L0_8x8
x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
}
}
else if( i_partition == D_8x16 )
{ // 8x16: same scheme as 16x8
for( int i = 0; i < 2; i++ )
{ // refine the L0 ME unless the partition type is D_L1_8x8
if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )// refine the L1 ME unless the partition type is D_L0_8x8
x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
}
}
else if( i_partition == D_8x8 )
{ // 8x8: per 8x8 block; D_DIRECT_8x8 sub-partitions are left untouched
for( int i = 0; i < 4; i++ )
{
x264_me_t *m;
int i_part_cost_old;
int i_type_cost;
int i_part_type = h->mb.i_sub_partition[i];
int b_bidir = (i_part_type == D_BI_8x8);
if( i_part_type == D_DIRECT_8x8 )
continue;
if( x264_mb_partition_listX_table[0][i_part_type] )
{ // the sub-partition type uses list 0
m = &analysis.l0.me8x8[i];
i_part_cost_old = m->cost;// remember the cost before refinement
i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];// lambda-weighted sub-mb type cost, removed during refinement
m->cost -= i_type_cost;
x264_me_refine_qpel( h, m );
if( !b_bidir )
analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;// apply the cost change to i_cost8x8bi (non-bidir blocks only)
}
if( x264_mb_partition_listX_table[1][i_part_type] )
{ // the sub-partition type uses list 1: same steps with the L1 ME
m = &analysis.l1.me8x8[i];
i_part_cost_old = m->cost;
i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
m->cost -= i_type_cost;
x264_me_refine_qpel( h, m );
if( !b_bidir )
analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
}
/* TODO: update mvp? */
}
}
i_satd_inter = i_cost;
if( analysis.i_mbrd )
{ // RD: call mb_analyse_b_rd, then start from the skip cost and take whichever RD cost is lower
mb_analyse_b_rd( h, &analysis, i_satd_inter );
i_type = B_SKIP;
i_cost = i_bskip_cost;
i_partition = D_16x16;
COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
h->mb.i_type = i_type;
h->mb.i_partition = i_partition;
}
if( h->mb.b_chroma_me )
{ // b_chroma_me set: the chroma cost is included in the intra costs
if( CHROMA444 )
{
mb_analyse_intra( h, &analysis, i_satd_inter );
mb_analyse_intra_chroma( h, &analysis );
}
else
{ // non-4:4:4: chroma intra analysis first, then mb_analyse_intra with i_satd_chroma subtracted from the cost passed in
mb_analyse_intra_chroma( h, &analysis );
mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
}
analysis.i_satd_i16x16 += analysis.i_satd_chroma;
analysis.i_satd_i8x8 += analysis.i_satd_chroma;
analysis.i_satd_i4x4 += analysis.i_satd_chroma;
}
else
mb_analyse_intra( h, &analysis, i_satd_inter );
if( analysis.i_mbrd )
{ // i_satd_inter and i_cost are passed by pointer, so the call may update them
mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );// third argument is i_satd_inter * 17/16 + 1
}
// take an intra type (or I_PCM) instead if its cost is below the best inter cost
COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
// store the final type and partition
h->mb.i_type = i_type;
h->mb.i_partition = i_partition;
if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
intra_rd_refine( h, &analysis );// intra type other than I_PCM with i_mbrd >= 2
if( h->mb.i_subpel_refine >= 5 )// refine_bidir is called from i_subpel_refine 5 upwards
refine_bidir( h, &analysis );
if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
{ // RD refinement for types strictly between B_DIRECT and B_SKIP, with i_mbrd >= 2
int i_biweight;
analyse_update_cache( h, &analysis );
// the refinement call depends on the partition shape and on which list(s) each partition uses
if( i_partition == D_16x16 )
{
if( i_type == B_L0_L0 )
{ // L0: seed the ME cost with i_cost, then RD qpel refinement (last argument 0)
analysis.l0.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
}
else if( i_type == B_L1_L1 )
{ // L1: same, with last argument 1
analysis.l1.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
}
else if( i_type == B_BI_BI )
{ // bi: look up the bipred weight for the two reference indices, then bidir RD refinement
i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
}
}
else if( i_partition == D_16x8 )
{
for( int i = 0; i < 2; i++ )
{ // write the partition's type into both of its i_sub_partition entries, then refine by list usage
h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
{
i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
}
}
}
else if( i_partition == D_8x16 )
{
for( int i = 0; i < 2; i++ )
{ // same scheme as 16x8, using entries i and i+2
h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
{
i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
}
}
}
else if( i_partition == D_8x8 )
{
for( int i = 0; i < 4; i++ )
{ // per 8x8 block, by its sub-partition type; other types (e.g. direct) are not refined
if( h->mb.i_sub_partition[i] == D_L0_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
{
i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
}
}
}
}
}
}
// shared tail: update the cache from the final analysis state
analyse_update_cache( h, &analysis );
/* In rare cases we can end up qpel-RDing our way back to a larger partition size
* without realizing it. Check for this and account for it if necessary. */
if( analysis.i_mbrd >= 2 )
{
/* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};// 1 = check list 0, 2 = check list 1, 0 (every other type) = no check
int list = check_mv_lists[h->mb.i_type] - 1;// -1 when the type is not checked
// if the cached MV and ref at x264_scan8[0] equal those at x264_scan8[12], record the partition as D_16x16
if( list >= 0 && h->mb.i_partition != D_16x16 &&
M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
h->mb.i_partition = D_16x16;
}
if( !analysis.i_mbrd )// without RD (i_mbrd == 0), call mb_analyse_transform here
mb_analyse_transform( h );
if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )// i_mbrd == 3 and not a skip type: mb_analyse_qp_rd
mb_analyse_qp_rd( h, &analysis );
h->mb.b_trellis = h->param.analyse.i_trellis;
h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));
// psy_trellis_init is called for non-skip types when i_psy_trellis is nonzero and i_trellis == 1
if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
}
2.P帧16x16帧间预测mb_analyse_inter_p16x16
代码如下:
/* P-slice 16x16 inter analysis.
 *
 * Searches every list-0 reference with a 16x16 block, keeps the cheapest
 * result in a->l0.me16x16 and sets h->mb.i_type to P_L0.  Two shortcuts can
 * turn the macroblock into P_SKIP instead: an early exit right after the
 * search on reference 0, and an RD check at the end when a->i_mbrd is set. */
static void mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t cur;
    int n_cand;
    ALIGNED_ARRAY_8( int16_t, cand,[8],[2] );
    int hpel_thresh = INT_MAX;
    int *p_hpel_thresh = NULL;

    /* The threshold is only passed to the searches when early termination is
     * enabled and more than one reference is available. */
    if( a->b_early_terminate && h->mb.pic.i_fref[0] > 1 )
        p_hpel_thresh = &hpel_thresh;

    /* 16x16 search on every reference frame */
    cur.i_pixel = PIXEL_16x16;
    LOAD_FENC( &cur, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int ref_idx = 0; ref_idx < h->mb.pic.i_fref[0]; ref_idx++ )
    {
        /* The reference cost is taken off the threshold for the duration of
         * this search and added back afterwards. */
        cur.i_ref_cost = REF_COST( 0, ref_idx );
        hpel_thresh -= cur.i_ref_cost;

        /* point the search state at this reference's planes */
        LOAD_HPELS( &cur, h->mb.pic.p_fref[0][ref_idx], 0, ref_idx, 0, 0 );
        LOAD_WPELS( &cur, h->mb.pic.p_fref_w[ref_idx], 0, ref_idx, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, ref_idx, cur.mvp );

        if( h->mb.ref_blind_dupe != ref_idx )
        {
            /* regular path: gather candidates, then run the motion search */
            x264_mb_predict_mv_ref16x16( h, 0, ref_idx, cand, &n_cand );
            x264_me_search_ref( h, &cur, cand, n_cand, p_hpel_thresh );
        }
        else
        {
            /* duplicate reference: start from the MV saved for reference 0
             * and only run the refdupe qpel refinement */
            CP32( cur.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &cur, p_hpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][ref_idx][h->mb.i_mb_xy], cur.mv );
        CP32( a->l0.mvc[ref_idx][0], cur.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( ref_idx == 0 && a->b_try_skip )
        {
            int resid_cost = cur.cost - cur.cost_mv;
            int skip_dist = abs( cur.mv[0] - h->mb.cache.pskip_mv[0] )
                          + abs( cur.mv[1] - h->mb.cache.pskip_mv[1] );
            /* the pskip probe is evaluated last, and only if the cheap tests pass */
            if( resid_cost < 300*a->i_lambda && skip_dist <= 1 && x264_macroblock_probe_pskip( h ) )
            {
                h->mb.i_type = P_SKIP;
                analyse_update_cache( h, a );
                assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
                return;
            }
        }

        cur.cost += cur.i_ref_cost;
        hpel_thresh += cur.i_ref_cost;

        /* keep the cheapest reference seen so far */
        if( cur.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &cur, sizeof(x264_me_t) );
    }

    /* publish the winning reference index to the macroblock cache */
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( !a->i_mbrd )
        return;

    int b_cache_arg = a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8;
    mb_init_fenc_cache( h, b_cache_arg );

    /* The RD skip check only applies when the winner is reference 0 with
     * exactly the pskip MV, and intra is not being forced. */
    if( a->l0.me16x16.i_ref != 0 )
        return;
    if( M32( a->l0.me16x16.mv ) != M32( h->mb.cache.pskip_mv ) )
        return;
    if( a->b_force_intra )
        return;

    h->mb.i_partition = D_16x16;
    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
    a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
    /* both coded-block patterns are zero after the RD pass: code it as P_SKIP */
    if( h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
        h->mb.i_type = P_SKIP;
}
3.预测16x16宏块的运动矢量x264_mb_predict_mv_16x16
根据当前宏块左侧、上方、右上方(右上方不可用时改用左上方)相邻块的参考帧索引和运动矢量,计算预测运动矢量mvp,代码如下:
/* Compute the predicted motion vector for a 16x16 partition.
 *
 * Reads the cached reference index and MV at three neighbour positions of
 * X264_SCAN8_0: left (-1), top (-8) and top-right (-8+4).  When the top-right
 * reference is -2, the top-left position (-8-1) is used in its place.
 *
 * mvp receives:
 *   - the single neighbour MV whose reference equals i_ref, if exactly one matches;
 *   - the left MV, if none matches, top and diagonal references are both -2
 *     and the left reference is not -2;
 *   - the median of the three neighbour MVs otherwise. */
void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
{
    int ref_left = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
    int16_t *mv_left = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
    int ref_top = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
    int16_t *mv_top = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];

    /* pick the diagonal neighbour: top-right unless its reference is -2 */
    int diag_pos = X264_SCAN8_0 - 8 + 4;
    if( h->mb.cache.ref[i_list][diag_pos] == -2 )
        diag_pos = X264_SCAN8_0 - 8 - 1;
    int ref_diag = h->mb.cache.ref[i_list][diag_pos];
    int16_t *mv_diag = h->mb.cache.mv[i_list][diag_pos];

    int hit_left = ref_left == i_ref;
    int hit_top = ref_top == i_ref;
    int hit_diag = ref_diag == i_ref;
    int hits = hit_left + hit_top + hit_diag;

    if( hits == 1 )
    {
        /* exactly one neighbour uses the requested reference: copy its MV */
        int16_t *only = hit_left ? mv_left : hit_top ? mv_top : mv_diag;
        CP32( mvp, only );
        return;
    }

    if( hits == 0 && ref_top == -2 && ref_diag == -2 && ref_left != -2 )
    {
        /* only the left neighbour's reference differs from -2: copy its MV */
        CP32( mvp, mv_left );
        return;
    }

    /* two or three matches, or no usable shortcut: component-wise median */
    x264_median_mv( mvp, mv_left, mv_top, mv_diag );
}
4.获取预测运动矢量x264_mb_predict_mv_ref16x16
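函数x264_mb_predict_mv_ref16x16用于为16x16宏块的运动搜索收集候选运动矢量:依次来自B帧时缓存中x264_scan8[12]位置的运动矢量、低分辨率(lowres)运动矢量、左/上/左上/右上相邻宏块的运动矢量,以及参考帧h->fref[0][0]中保存的mv16x16经缩放后的时域运动矢量;候选矢量存入mvc数组,候选数量通过i_mvc返回。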
代码如下:
/* Collect candidate motion vectors for a 16x16 search on reference i_ref of
 * list i_list.
 *
 * Candidates are appended to mvc in this order: the cached MV at
 * x264_scan8[12] (B slices only, when its reference matches), the lowres MV
 * (reference 0 only), the four spatial neighbours (left, top, top-left,
 * top-right), and up to three temporal MVs scaled from h->fref[0][0].
 * The number of candidates written is stored in *i_mvc. */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc )
{
    int16_t (*ref_mvs)[2] = h->mb.mvr[i_list][i_ref];
    int n = 0;

    /* b_direct */
    if( h->sh.i_type == SLICE_TYPE_B && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
    {
        CP32( mvc[n], h->mb.cache.mv[i_list][x264_scan8[12]] );
        n++;
    }

    /* lowres MV, only for reference 0 and only when lowres data exists */
    if( i_ref == 0 && h->frames.b_have_lowres )
    {
        int dist;
        if( i_list )
            dist = h->fref[1][0]->i_frame-h->fenc->i_frame-1;
        else
            dist = h->fenc->i_frame-h->fref[0][0]->i_frame-1;

        if( dist <= h->param.i_bframe )
        {
            int16_t (*lowres_mv)[2] = h->fenc->lowres_mvs[i_list][dist];
            /* entries whose first component is 0x7fff are skipped */
            if( lowres_mv[0][0] != 0x7fff )
            {
                /* Double both packed 16-bit components with one 32-bit
                 * multiply; the mask clears bit 16, where the top bit of the
                 * low half would otherwise spill into the high half. */
                M32( mvc[n] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
                n++;
            }
        }
    }

    /* spatial predictors: left, top, top-left, top-right */
    const int neighbour_xy[4] =
    {
        h->mb.i_mb_left_xy[0],
        h->mb.i_mb_top_xy,
        h->mb.i_mb_topleft_xy,
        h->mb.i_mb_topright_xy,
    };
    for( int k = 0; k < 4; k++ )
    {
        int xy = neighbour_xy[k];
        if( !SLICE_MBAFF )
        {
            /* non-MBAFF: copy the neighbour's saved MV unconditionally */
            CP32( mvc[n], ref_mvs[xy] );
            n++;
        }
        else if( xy >= 0 )
        {
            /* MBAFF: negative neighbour indices are skipped; the reference
             * index and the vertical component are adjusted by a shift
             * derived from MB_INTERLACED and the neighbour's field flag */
            int shift = 1 + MB_INTERLACED - h->mb.field[xy];
            int16_t *src = h->mb.mvr[i_list][i_ref<<1>>shift][xy];
            mvc[n][0] = src[0];
            mvc[n][1] = src[1]*2>>shift;
            n++;
        }
    }

    /* temporal predictors */
    x264_frame_t *l0 = h->fref[0][0];
    if( l0->i_ref[0] > 0 )
    {
        int field = h->mb.i_mb_y&1;
        int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
        int refpoc = h->fref[i_list][i_ref>>SLICE_MBAFF]->i_poc
                   + l0->i_delta_poc[field^(i_ref&1)];
        /* the same scale factor applies to every temporal candidate */
        int scale = (curpoc - refpoc) * l0->inv_ref_poc[MB_INTERLACED&field];

        /* co-located MB always; right and below MBs only when inside the frame */
        int mb_offsets[3];
        int n_offsets = 0;
        mb_offsets[n_offsets++] = 0;
        if( h->mb.i_mb_x < h->mb.i_mb_width-1 )
            mb_offsets[n_offsets++] = 1;
        if( h->mb.i_mb_y < h->mb.i_mb_height-1 )
            mb_offsets[n_offsets++] = h->mb.i_mb_stride;

        for( int k = 0; k < n_offsets; k++ )
        {
            int mb_index = h->mb.i_mb_xy + mb_offsets[k];
            /* scale with rounding (+128, >>8), then clamp to the int16_t range */
            mvc[n][0] = x264_clip3( (l0->mv16x16[mb_index][0]*scale + 128) >> 8, INT16_MIN, INT16_MAX );
            mvc[n][1] = x264_clip3( (l0->mv16x16[mb_index][1]*scale + 128) >> 8, INT16_MIN, INT16_MAX );
            n++;
        }
    }

    *i_mvc = n;
}
5.帧间运动估计x264_me_search_ref
其中运动估计算法有钻石搜索算法X264_ME_DIA、六边形搜索算法X264_ME_HEX、非对称十字多六边形网格搜索算法X264_ME_UMH,以及穷尽搜索算法X264_ME_ESA和X264_ME_TESA(两者共用同一分支,TESA在其基础上增加了阈值筛选),其中钻石搜索算法和六边形搜索算法可以参考【x265编码器】章节5——x265帧间运动估计流程,两者基本一致;
代码如下:
/*
 * Fullpel motion search for the block described by m, followed by optional
 * subpel refinement.
 *
 * Steps, in order:
 *   1. Evaluate the predicted MV (m->mvp), the extra candidates in mvc and the
 *      zero vector to choose a starting point (bmx, bmy) with cost bcost.
 *   2. Run the fullpel search selected by h->mb.i_me_method
 *      (X264_ME_DIA, X264_ME_HEX, X264_ME_UMH, X264_ME_ESA / X264_ME_TESA).
 *   3. Store the result in m->mv and m->cost (and m->cost_mv when
 *      h->mb.i_subpel_refine < 3).
 *   4. Call refine_subpel() when h->mb.i_subpel_refine >= 2.
 *
 * h                 encoder context
 * m                 search state; reads i_pixel, i_stride[0], p_fenc[0], p_fref_w,
 *                   mvp, p_cost_mv and integral; writes mv, cost and cost_mv
 * mvc, i_mvc        extra candidate motion vectors and their count
 * p_halfpel_thresh  passed through unchanged to refine_subpel()
 *
 * NOTE(review): the COST_MV*, CROSS and DIA1_ITER macros are defined outside
 * this function and appear to read/update locals such as bcost, bmx, bmy, omx,
 * omy and costs by name — confirm before renaming any local.
 */
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
const int bw = x264_pixel_size[m->i_pixel].w; // block width and height (bw, bh)
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel; // block-size index, used to index x264_pixel_size and the h->pixf tables
const int stride = m->i_stride[0]; // row pitch used when indexing p_fref_w
int i_me_range = h->param.analyse.i_me_range; // search range from the encoder parameters
int bmx, bmy, bcost = COST_MAX;
int bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_32( pixel, pix,[16*16] ); // 16x16 scratch buffer; not named again in this body, presumably used by the cost macros — confirm
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
ALIGNED_ARRAY_16( int, costs,[16] );
int mv_x_min = h->mb.mv_limit_fpel[0][0];
int mv_y_min = h->mb.mv_limit_fpel[0][1];
int mv_x_max = h->mb.mv_limit_fpel[1][0];
int mv_y_max = h->mb.mv_limit_fpel[1][1];
/* Special version of pack to allow shortcuts in CHECK_MVRANGE */
#define pack16to32_mask2(mx,my) (((uint32_t)(mx)<<16)|((uint32_t)(my)&0x7FFF))
uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;
uint32_t pmv, bpred_mv = 0;
#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
/* Try extra predictors if provided. If subme >= 3, check subpel predictors,
* otherwise round them to fullpel. */
if( h->mb.i_subpel_refine >= 3 )
{
/* Calculate and check the MVP first */ // clamp the MVP to [SPEL(min), SPEL(max)] on each axis
int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) );
int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) );
pmv = pack16to32_mask( bpred_mx, bpred_my ); // pack the clamped MVP into one 32-bit value
pmx = FPEL( bpred_mx ); // FPEL() of the clamped MVP (presumably its fullpel position — confirm)
pmy = FPEL( bpred_my );
COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); // cost of the clamped MVP, stored in bpred_cost
int pmv_cost = bpred_cost;
if( i_mvc > 0 ) // extra candidates were supplied
{ // clipped candidates are written to mvc_temp starting at index 2
/* Clip MV candidates and eliminate those equal to zero and pmv. */
int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
if( valid_mvcs > 0 )
{
int i = 1, cost;
/* We stuff pmv here to branchlessly pick between pmv and the various
* MV candidates. [0] gets skipped in order to maintain alignment for
* x264_predictor_clip. */
M32( mvc_temp[1] ) = pmv;
bpred_cost <<= 4;
do
{ // cost each candidate; the low 4 bits of bpred_cost carry the index of the best one so far
int mx = mvc_temp[i+1][0];
int my = mvc_temp[i+1][1];
COST_MV_HPEL( mx, my, cost );
COPY1_IF_LT( bpred_cost, (cost << 4) + i );
} while( ++i <= valid_mvcs );
bpred_mx = mvc_temp[(bpred_cost&15)+1][0];
bpred_my = mvc_temp[(bpred_cost&15)+1][1];
bpred_cost >>= 4;
}
}
/* Round the best predictor back to fullpel and get the cost, since this is where
* we'll be starting the fullpel motion search. */
bmx = FPEL( bpred_mx ); // starting point of the fullpel search
bmy = FPEL( bpred_my );
bpred_mv = pack16to32_mask(bpred_mx, bpred_my); // packed form of the best predictor
if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */
COST_MV( bmx, bmy );
else /* Otherwise just copy the cost (we already know it) */
bcost = bpred_cost;
/* Test the zero vector if it hasn't been tested yet. */
if( pmv ) // MVP is nonzero: test (0,0) unless the current starting point is already (0,0)
{
if( bmx|bmy ) COST_MV( 0, 0 );
}
/* If a subpel mv candidate was better than the zero vector, the previous
* fullpel check won't have gotten it even if the pmv was zero. So handle
* that possibility here. */
else
{ // MVP is zero: switch to (0,0) with the MVP's cost when that is lower than bcost
COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 );
}
}
else
{ // i_subpel_refine < 3: predictors are only checked at fullpel precision
/* Calculate and check the fullpel MVP first */ // the clamped fullpel MVP is also the initial best MV
bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max );
bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max );
pmv = pack16to32_mask( bmx, bmy ); // packed fullpel MVP
/* Because we are rounding the predicted motion vector to fullpel, there will be
* an extra MV cost in 15 out of 16 cases. However, when the predicted MV is
* chosen as the best predictor, it is often the case that the subpel search will
* result in a vector at or next to the predicted motion vector. Therefore, we omit
* the cost of the MV from the rounded MVP to avoid unfairly biasing against use of
* the predicted motion vector.
*
* Disclaimer: this is a post-hoc rationalization for why this hack works. */
bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); // pixel-compare cost at the rounded MVP, with no MV cost added
if( i_mvc > 0 ) // extra candidates were supplied
{ // same scheme as the subme>=3 branch, on rounded candidates
/* Like in subme>=3, except we also round the candidates to fullpel. */
int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
if( valid_mvcs > 0 )
{
int i = 1, cost;
M32( mvc_temp[1] ) = pmv;
bcost <<= 4;
do
{ // cost each rounded candidate (pixel compare + MV bits); the low 4 bits of bcost carry the best index
int mx = mvc_temp[i+1][0];
int my = mvc_temp[i+1][1];
cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
COPY1_IF_LT( bcost, (cost << 4) + i );
} while( ++i <= valid_mvcs );
bmx = mvc_temp[(bcost&15)+1][0];
bmy = mvc_temp[(bcost&15)+1][1];
bcost >>= 4;
}
}
/* Same as above, except the condition is simpler. */
if( pmv ) // MVP is nonzero, so also test the zero vector
COST_MV( 0, 0 );
}
switch( h->mb.i_me_method )
{
case X264_ME_DIA: // diamond search
{
/* diamond search, radius 1 */
bcost <<= 4; // free the low 4 bits of bcost for a direction code
int i = i_me_range;
do
{ // cost the four neighbours (0,-1) (0,1) (-1,0) (1,0); a cheaper one replaces bcost and tags it with its code
COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
COPY1_IF_LT( bcost, (costs[1]<<4)+3 );
COPY1_IF_LT( bcost, (costs[2]<<4)+4 );
COPY1_IF_LT( bcost, (costs[3]<<4)+12 );
if( !(bcost&15) ) // direction code still 0: no neighbour beat the centre, stop
break;
bmx -= (int32_t)((uint32_t)bcost<<28)>>30; // bits 3..2 of the code are a signed x step, bits 1..0 a signed y step; both are subtracted
bmy -= (int32_t)((uint32_t)bcost<<30)>>30;
bcost &= ~15; // clear the direction code for the next iteration
} while( --i && CHECK_MVRANGE(bmx, bmy) ); // at most i_me_range steps, and only while the centre passes CHECK_MVRANGE
bcost >>= 4; // drop the direction bits, restoring the plain cost
break;
}
case X264_ME_HEX:
{ // hexagon search; also entered via goto me_hex2 from the UMH case
me_hex2:
/* hexagon search, radius 2 */
#if 0
for( int i = 0; i < i_me_range/2; i++ )
{
omx = bmx; omy = bmy;
COST_MV( omx-2, omy );
COST_MV( omx-1, omy+2 );
COST_MV( omx+1, omy+2 );
COST_MV( omx+2, omy );
COST_MV( omx+1, omy-2 );
COST_MV( omx-1, omy-2 );
if( bmx == omx && bmy == omy )
break;
if( !CHECK_MVRANGE(bmx, bmy) )
break;
}
#else
/* equivalent to the above, but eliminates duplicate candidates */
/* hexagon */ // cost all six hexagon points first; the low 3 bits of bcost record the winner
COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
bcost <<= 3;
COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
if( bcost&7 )
{
int dir = (bcost&7)-2;
bmx += hex2[dir+1][0];
bmy += hex2[dir+1][1];
// each further step only costs three points taken from hex2 around the current direction
/* half hexagon, not overlapping the previous iteration */
for( int i = (i_me_range>>1) - 1; i > 0 && CHECK_MVRANGE(bmx, bmy); i-- )
{
COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1],
hex2[dir+1][0], hex2[dir+1][1],
hex2[dir+2][0], hex2[dir+2][1],
costs );
bcost &= ~7;
COPY1_IF_LT( bcost, (costs[0]<<3)+1 );
COPY1_IF_LT( bcost, (costs[1]<<3)+2 );
COPY1_IF_LT( bcost, (costs[2]<<3)+3 );
if( !(bcost&7) ) // none of the three points improved the cost: stop
break;
dir += (bcost&7)-2;
dir = mod6m1[dir+1];
bmx += hex2[dir+1][0];
bmy += hex2[dir+1][1];
}
}
bcost >>= 3;
#endif
/* square refine */ // cost the 8 surrounding positions; the low 4 bits of bcost index square1 for the final step
bcost <<= 4;
COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
COPY1_IF_LT( bcost, (costs[1]<<4)+2 );
COPY1_IF_LT( bcost, (costs[2]<<4)+3 );
COPY1_IF_LT( bcost, (costs[3]<<4)+4 );
COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs );
COPY1_IF_LT( bcost, (costs[0]<<4)+5 );
COPY1_IF_LT( bcost, (costs[1]<<4)+6 );
COPY1_IF_LT( bcost, (costs[2]<<4)+7 );
COPY1_IF_LT( bcost, (costs[3]<<4)+8 );
bmx += square1[bcost&15][0];
bmy += square1[bcost&15][1];
bcost >>= 4; // drop the index bits, restoring the plain cost
break;
}
case X264_ME_UMH:
{
/* Uneven-cross Multi-Hexagon-grid Search
* as in JM, except with different early termination */
static const uint8_t pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
int ucost1, ucost2;
int cross_start = 1;
/* refine predictors */
ucost1 = bcost;
DIA1_ITER( pmx, pmy );
if( pmx | pmy )
DIA1_ITER( 0, 0 );
if( i_pixel == PIXEL_4x4 )
goto me_hex2;
ucost2 = bcost;
if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
DIA1_ITER( bmx, bmy );
if( bcost == ucost2 )
cross_start = 3;
omx = bmx; omy = bmy;
/* early termination */
#define SAD_THRESH(v) ( bcost < ( v >> pixel_size_shift[i_pixel] ) )
if( bcost == ucost2 && SAD_THRESH(2000) )
{
COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 );
if( bcost == ucost1 && SAD_THRESH(500) )
break;
if( bcost == ucost2 )
{
int range = (i_me_range>>1) | 1;
CROSS( 3, range, range );
COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
if( bcost == ucost2 )
break;
cross_start = range + 2;
}
}
/* adaptive search range */
if( i_mvc )
{
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
* these have not been tuned much by actual encoding. */
static const uint8_t range_mul[4][4] =
{
{ 3, 3, 4, 4 },
{ 3, 4, 4, 4 },
{ 4, 4, 4, 5 },
{ 4, 4, 5, 6 },
};
int mvd;
int sad_ctx, mvd_ctx;
int denom = 1;
if( i_mvc == 1 )
{
if( i_pixel == PIXEL_16x16 )
/* mvc is probably the same as mvp, so the difference isn't meaningful.
* but prediction usually isn't too bad, so just use medium range */
mvd = 25;
else
mvd = abs( m->mvp[0] - mvc[0][0] )
+ abs( m->mvp[1] - mvc[0][1] );
}
else
{
/* calculate the degree of agreement between predictors. */
/* in 16x16, mvc includes all the neighbors used to make mvp,
* so don't count mvp separately. */
denom = i_mvc - 1;
mvd = 0;
if( i_pixel != PIXEL_16x16 )
{
mvd = abs( m->mvp[0] - mvc[0][0] )
+ abs( m->mvp[1] - mvc[0][1] );
denom++;
}
mvd += x264_predictor_difference( mvc, i_mvc );
}
sad_ctx = SAD_THRESH(1000) ? 0
: SAD_THRESH(2000) ? 1
: SAD_THRESH(4000) ? 2 : 3;
mvd_ctx = mvd < 10*denom ? 0
: mvd < 20*denom ? 1
: mvd < 40*denom ? 2 : 3;
i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2;
}
/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
* we are still centered on the same place as the DIA2. is this desirable? */
CROSS( cross_start, i_me_range, i_me_range>>1 );
COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );
/* hexagon grid */
omx = bmx; omy = bmy;
const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
int i = 1;
do
{
static const int8_t hex4[16][2] = {
{ 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
{-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
{-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
{-4, 2}, { 4, 2}, {-2, 3}, { 2, 3},
};
if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
mv_y_max-omy, omy-mv_y_min ) )
{
for( int j = 0; j < 16; j++ )
{
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
if( CHECK_MVRANGE(mx, my) )
COST_MV( mx, my );
}
}
else
{
int dir = 0;
pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride;
int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base x0*i+(y0-2*k+4)*dy,\
pix_base x1*i+(y1-2*k+4)*dy,\
pix_base x2*i+(y2-2*k+4)*dy,\
pix_base x3*i+(y3-2*k+4)*dy,\
stride, costs+4*k );\
pix_base += 2*dy;
#define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i]
#define MIN_MV(k,x,y) COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) )
SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 );
SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 );
SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 );
SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 );
ADD_MVCOST( 0, 0,-4 );
ADD_MVCOST( 1, 0, 4 );
ADD_MVCOST( 2,-2,-3 );
ADD_MVCOST( 3, 2,-3 );
ADD_MVCOST( 4,-4,-2 );
ADD_MVCOST( 5, 4,-2 );
ADD_MVCOST( 6,-4,-1 );
ADD_MVCOST( 7, 4,-1 );
ADD_MVCOST( 8,-4, 0 );
ADD_MVCOST( 9, 4, 0 );
ADD_MVCOST( 10,-4, 1 );
ADD_MVCOST( 11, 4, 1 );
ADD_MVCOST( 12,-4, 2 );
ADD_MVCOST( 13, 4, 2 );
ADD_MVCOST( 14,-2, 3 );
ADD_MVCOST( 15, 2, 3 );
MIN_MV( 0, 0,-4 );
MIN_MV( 1, 0, 4 );
MIN_MV( 2,-2,-3 );
MIN_MV( 3, 2,-3 );
MIN_MV( 4,-4,-2 );
MIN_MV( 5, 4,-2 );
MIN_MV( 6,-4,-1 );
MIN_MV( 7, 4,-1 );
MIN_MV( 8,-4, 0 );
MIN_MV( 9, 4, 0 );
MIN_MV( 10,-4, 1 );
MIN_MV( 11, 4, 1 );
MIN_MV( 12,-4, 2 );
MIN_MV( 13, 4, 2 );
MIN_MV( 14,-2, 3 );
MIN_MV( 15, 2, 3 );
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
if( dir )
{
bmx = omx + i*(dir>>4);
bmy = omy + i*((int32_t)((uint32_t)dir<<28)>>28);
}
}
} while( ++i <= i_me_range>>2 );
if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min )
goto me_hex2;
break;
}
case X264_ME_ESA:
case X264_ME_TESA:
{
const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
/* SEA is fastest in multiples of 4 */
const int width = (max_x - min_x + 3) & ~3;
#if 0
/* plain old exhaustive search */
for( int my = min_y; my <= max_y; my++ )
for( int mx = min_x; mx < min_x + width; mx++ )
COST_MV( mx, my );
#else
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
uint16_t *sums_base = m->integral;
ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( (pixel*)x264_zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
FENC_STRIDE, enc_dc );
if( delta == 4 )
sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
delta *= stride;
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
enc_dc[1] = enc_dc[2];
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4);
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
+ BITS_MVD( bmx, bmy );
for( int my = min_y; my <= max_y; my++ )
{
int i;
int ycost = p_cost_mvy[my*4];
if( bsad <= ycost )
continue;
bsad -= ycost;
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 );
for( i = 0; i < xn-2; i += 3 )
{
pixel *ref = p_fref_w+min_x+my*stride;
ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
{
int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
if( sad < bsad*sad_thresh>>3 )
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
mvsads[nmvsad].mv[0] = min_x+xs[i+j];
mvsads[nmvsad].mv[1] = my;
nmvsad++;
}
}
}
for( ; i < xn; i++ )
{
int mx = min_x+xs[i];
int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
+ cost_fpel_mvx[xs[i]];
if( sad < bsad*sad_thresh>>3 )
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
mvsads[nmvsad].mv[0] = mx;
mvsads[nmvsad].mv[1] = my;
nmvsad++;
}
}
bsad += ycost;
}
limit = i_me_range >> 1;
sad_thresh = bsad*sad_thresh>>3;
while( nmvsad > limit*2 && sad_thresh > bsad )
{
int i = 0;
// halve the range if the domain is too large... eh, close enough
sad_thresh = (sad_thresh + bsad) >> 1;
while( i < nmvsad && mvsads[i].sad <= sad_thresh )
i++;
for( int j = i; j < nmvsad; j++ )
{
uint32_t sad;
if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
{
uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
#if WORDS_BIGENDIAN
mvsad >>= 32;
#endif
sad = mvsad;
}
else
{
sad = mvsads[j].sad;
CP32( mvsads[i].mv, mvsads[j].mv );
mvsads[i].sad = sad;
}
i += (sad - (sad_thresh+1)) >> 31;
}
nmvsad = i;
}
while( nmvsad > limit )
{
int bi = 0;
for( int i = 1; i < nmvsad; i++ )
if( mvsads[i].sad > mvsads[bi].sad )
bi = i;
nmvsad--;
if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
CP64( &mvsads[bi], &mvsads[nmvsad] );
else
mvsads[bi] = mvsads[nmvsad];
}
for( int i = 0; i < nmvsad; i++ )
COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
}
else
{
// just ADS and SAD
for( int my = min_y; my <= max_y; my++ )
{
int i;
int ycost = p_cost_mvy[my*4];
if( bcost <= ycost )
continue;
bcost -= ycost;
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
cost_fpel_mvx+min_x, xs, width, bcost );
for( i = 0; i < xn-2; i += 3 )
COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += ycost;
for( ; i < xn; i++ )
COST_MV( min_x+xs[i], my );
}
}
#endif
}
break;
}
/* -> qpel mv */ // pack the fullpel result (bmx, bmy) of whichever search ran into 32 bits
uint32_t bmv = pack16to32_mask(bmx,bmy);
uint32_t bmv_spel = SPELx2(bmv); // the same vector passed through SPELx2 (presumably fullpel -> subpel units — confirm)
if( h->mb.i_subpel_refine < 3 )
{ // MV cost is looked up from p_cost_mvx / p_cost_mvy at the fullpel result
m->cost_mv = p_cost_mvx[bmx*4] + p_cost_mvy[bmy*4];
m->cost = bcost;
/* compute the real cost */ // the rounded MVP was costed without its MV cost earlier, so add it if the MVP won
if( bmv == pmv ) m->cost += m->cost_mv;
M32( m->mv ) = bmv_spel; // store the converted vector in m->mv
}
else
{
M32(m->mv) = bpred_cost < bcost ? bpred_mv : bmv_spel;
m->cost = X264_MIN( bpred_cost, bcost ); // keep the cheaper of the best predictor and the fullpel search result
}
/* subpel refine */ // only when i_subpel_refine >= 2
if( h->mb.i_subpel_refine >= 2 )
{ // iteration counts are read from columns 2 and 3 of subpel_iterations[i_subpel_refine]
int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
}
}