x264代码剖析（十四）：核心算法之宏块编码函数x264_macroblock

x264代码剖析（十四）：核心算法之宏块编码函数x264_macroblock_encode()

宏块编码函数x264_macroblock_encode()是完成变换与量化的主要函数，而x264_macroblock_encode()调用了x264_macroblock_encode_internal()函数，在x264_macroblock_encode_internal()函数中，主要完成了如下功能：

x264_macroblock_encode_skip()：编码Skip类型宏块。

x264_mb_encode_i16x16()：编码Intra16x16类型的宏块。该函数除了进行DCT变换之外，还对16个小块的DC系数进行了Hadamard变换。

x264_mb_encode_i4x4()：编码Intra4x4类型的宏块。

帧间宏块编码：这一部分代码直接写在了函数体里面。

x264_mb_encode_chroma()：编码色度块。

x264_macroblock_encode()函数与x264_macroblock_encode_internal()函数都处于encoder文件夹内的macroblock.c中，其调用关系图如下所示：

1、x264_macroblock_encode()函数

x264_macroblock_encode()函数处于encoder文件夹内的macroblock.c中，x264_macroblock_encode()封装了x264_macroblock_encode_internal()。如果色度模式是YUV444的话，传递的参数plane_count=3而chroma=0；如果不是YUV444的话，传递的参数plane_count=1而chroma=1。

对应的代码如下：

/******************************************************************/
/******************************************************************/
/*
======Analysed by RuiDong Fang
======Csdn Blog:http://blog.csdn.net/frd2009041510
======Date:2016.03.22
 */
/******************************************************************/
/******************************************************************/

/************====== 宏块编码函数x264_macroblock_encode() ======************/
/*
功能：x264_macroblock_encode()封装了x264_macroblock_encode_internal(),即编码的内部函数——残差DCT变换、量化
*/
void x264_macroblock_encode( x264_t *h )
{
    if( CHROMA444 )
        x264_macroblock_encode_internal( h, 3, 0 );//YUV444相当于把YUV3个分量都当做Y编码
    else
        x264_macroblock_encode_internal( h, 1, 1 );
}

2、x264_macroblock_encode_internal()函数

x264_macroblock_encode_internal()函数也处于encoder文件夹内的macroblock.c中，具体的代码分析如下：

/************====== 宏块编码函数x264_macroblock_encode_internal() ======************/
/*
功能：调用了编码-残差DCT变换、量化-内部函数
*/
/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    for( int p = 0; p < plane_count; p++ )
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;

	/*======== PCM ========*/
    if( h->mb.i_type == I_PCM )//PCM
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        for( int p = 0; p < plane_count; p++ )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
        if( chroma )
        {
            int height = 16 >> CHROMA_V_SHIFT;
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
        }
        return;
    }

    if( !h->mb.b_allow_skip )
    {
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

	//根据不同的宏块类型，进行编码
	/*======== P-skip ========*/
    if( h->mb.i_type == P_SKIP )
    {
        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
        {
            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                                  h->mb.mv_min[0], h->mb.mv_max[0] );
            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                                  h->mb.mv_min[1], h->mb.mv_max[1] );

            for( int p = 0; p < plane_count; p++ )
                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );

            if( chroma )
            {
                int v_shift = CHROMA_V_SHIFT;
                int height = 16 >> v_shift;

                /* Special case for mv0, which is (of course) very common in P-skip mode. */
                if( mvx | mvy )
                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                     mvx, 2*mvy>>v_shift, 8, height );
                else
                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                         h->mb.pic.i_stride[1], height );

                if( h->sh.weight[0][1].weightfn )
                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       &h->sh.weight[0][1], height );
                if( h->sh.weight[0][2].weightfn )
                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       &h->sh.weight[0][2], height );
            }
        }

        x264_macroblock_encode_skip( h );	编码skip类型宏块
        return;
    }

	/*======== B-skip ========*/
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );	编码skip类型宏块
        return;
    }

	/*======== 帧内 ========*/
    if( h->mb.i_type == I_16x16 )
    {
        h->mb.b_transform_8x8 = 0;

        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            x264_mb_encode_i16x16( h, p, i_qp );	如果是Intra16x16类型，调用x264_mb_encode_i16x16()编码宏块(分别编码Y，U，V)
    }
    else if( h->mb.i_type == I_8x8 )
    {
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
            {
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );	如果是Intra8x8类型，循环4次调用x264_mb_encode_i8x8()编码宏块
            }
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
            {
                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                    /* emulate missing topright samples */
                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );	如果是Intra4x4类型，循环16次调用x264_mb_encode_i4x4()编码宏块
            }
        }
    }
	/*======== 帧间 ========*/
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )//===================lossless情况
        {
            if( h->mb.b_transform_8x8 )
                for( int p = 0; p < plane_count; p++ )
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        int x = i8x8&1;
                        int y = i8x8>>1;
                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
                                                                           h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
                        STORE_8x8_NNZ( p, i8x8, nz );
                        h->mb.i_cbp_luma |= nz << i8x8;
                    }
            else
                for( int p = 0; p < plane_count; p++ )
                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                    {
                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
                    }
        }
        else if( h->mb.b_transform_8x8 )//===================DCT8x8情况
        {
            ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC

            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                CLEAR_16x16_NNZ( p );
                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;

                int plane_cbp = 0;
                for( int idx = 0; idx < 4; idx++ )
                {
                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );

                    if( nz )
                    {
                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
                        if( b_decimate )
                        {
                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
                            i_decimate_mb += i_decimate_8x8;
                            if( i_decimate_8x8 >= 4 )
                                plane_cbp |= 1<<idx;
                        }
                        else
                            plane_cbp |= 1<<idx;
                    }
                }

                if( i_decimate_mb >= 6 || !b_decimate )
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( idx, 0, plane_cbp )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ( p, idx, 1 );
                    }
                }
            }
        }
        else//===================最普通的情况
        {
            // 帧间预测：16x16 宏块被划分为8x8,每个8x8再次被划分为4x4
			ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                CLEAR_16x16_NNZ( p );
                
				//16x16DCT（实际上分解为16个4x4DCT）  
                //求编码帧p_fenc和重建帧p_fdec之间的残差，然后进行DCT变换
				h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );	///对16x16块调用x264_dct_function_t的sub16x16_dct()汇编函数，求得编码宏块数据p_fenc与重建宏块数据p_fdec之间的残差（“sub”），并对残差进行DCT变换

                if( h->mb.b_noise_reduction )
                {
                    h->nr_count[0+!!p*2] += 16;
                    for( int idx = 0; idx < 16; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                }

                int plane_cbp = 0;

                //16x16的块分成4个8x8的块
				for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int i_decimate_8x8 = b_decimate ? 0 : 6;
                    int nnz8x8 = 0;
                    if( h->mb.b_trellis )
                    {
                        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                        {
                            int idx = i8x8*4+i4x4;
                            if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
                            {
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                                nnz8x8 = 1;
                            }
                        }
                    }
                    else
                    {
                        //8x8的块分成4个4x4的块，每个4x4的块再分别进行量化
						nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );	/分成4个8x8的块，对每个8x8块分别调用x264_quant_function_t的quant_4x4x4()汇编函数进行量化
                        if( nz )
                        {
                            FOREACH_BIT( idx, i8x8*4, nz )
                            {
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );//建立重建帧
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );	//分成16个4x4的块，对每个4x4块分别调用x264_quant_function_t的dequant_4x4()汇编函数进行反量化（用于重建帧）
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                            }
                        }
                    }
                    if( nnz8x8 )
                    {
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 < 4 )
                            STORE_8x8_NNZ( p, i8x8, 0 );
                        else
                            plane_cbp |= 1<<i8x8;
                    }
                }

                if( i_decimate_mb < 6 )
                {
                    plane_cbp = 0;
                    CLEAR_16x16_NNZ( p );
                }
                else
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( i8x8, 0, plane_cbp )
                    {
                        //用于建立重建帧  
                        //残差进行DCT反变换之后，叠加到预测数据上
						h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );	分成4个8x8的块，对每个8x8块分别调用x264_dct_function_t的add8x8_idct()汇编函数，对残差进行DCT反变换，并将反变换后的数据叠加（“add”）至预测数据上（用于重建帧）
                    }
                }
            }
        }
    }

    /* encode chroma */
    if( chroma )
    {
        if( IS_INTRA( h->mb.i_type ) )
        {
            int i_mode = h->mb.i_chroma_pred_mode;
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }
        }

        /* encode the 8x8 blocks */
        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );	/编码色度块
    }
    else
        h->mb.i_cbp_chroma = 0;

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

从源代码可以看出，x264_macroblock_encode_internal()的流程大致如下：

（1）、如果是Skip类型，调用x264_macroblock_encode_skip()编码宏块。

（2）、如果是Intra16x16类型，调用x264_mb_encode_i16x16()编码宏块。

（3）、如果是Intra4x4类型，循环16次调用x264_mb_encode_i4x4()编码宏块。

（4）、如果是Inter类型，则不再调用子函数，而是直接进行编码：

a)、对16x16块调用x264_dct_function_t的sub16x16_dct()汇编函数，求得编码宏块数据p_fenc与重建宏块数据p_fdec之间的残差（“sub”），并对残差进行DCT变换。

b)、分成4个8x8的块，对每个8x8块分别调用x264_quant_function_t的quant_4x4x4()汇编函数进行量化。

c)、分成16个4x4的块，对每个4x4块分别调用x264_quant_function_t的dequant_4x4()汇编函数进行反量化（用于重建帧）。

d)、分成4个8x8的块，对每个8x8块分别调用x264_dct_function_t的add8x8_idct()汇编函数，对残差进行DCT反变换，并将反变换后的数据叠加（“add”）至预测数据上（用于重建帧）。

（5）、如果对色度编码，调用x264_mb_encode_chroma()。

从Inter宏块编码的步骤可以看出，编码就是“DCT变换+量化”两步的组合。在接下来的文章中将依次分析变换、量化的具体代码。