文章目录
对不起各位,本篇博文鸽了好久的下一期,主要是由于本人在校期间被分到了另外的方向,所以搁置了传统编解码的学习,目前正在考虑重新回归编解码坑,博文可能会再次更新~
参考AVS3代码阅读(HPM6.0)(一.整体架构以及CU划分部分代码)
版本:HPM4.0
核心文件:app_encoder.c
结构体
ENC_PICO:原图缓冲结构
/*****************************************************************************
* original picture buffer structure (ENC_PICO)
*****************************************************************************/
typedef struct _ENC_PICO
{
/* original picture store */
COM_PIC pic;
/* input picture count */
u32 pic_icnt;
/* whether this buffer is being used as encoding input */
u8 is_used;
/* address of sub-picture */
COM_PIC * spic;
} ENC_PICO;
ENC_PINTRA:帧内预测结构
/*****************************************************************************
* intra prediction structure (ENC_PINTRA)
*****************************************************************************/
typedef struct _ENC_PINTRA
{
/* temporary prediction buffer */
pel pred[N_C][MAX_CU_DIM];
pel pred_cache[IPD_CNT][MAX_CU_DIM]; // only for luma
/* reconstruction buffer */
pel rec[N_C][MAX_CU_DIM];
/* address of original (input) picture buffer, one pointer per component */
pel * addr_org[N_C];
/* stride of original (input) picture buffer, one per component */
int stride_org[N_C];
/* address of reconstruction picture buffer, one pointer per component */
pel * addr_rec_pic[N_C];
/* stride of reconstruction picture buffer, one per component */
int stride_rec[N_C];
/* QP for luma */
u8 qp_y;
/* QP for chroma */
u8 qp_u;
u8 qp_v;
int slice_type;
int complexity;
void * pdata[4];
int * ndata[4];
int bit_depth;
} ENC_PINTRA;
ENC_PINTER:帧间预测结构
/*****************************************************************************
* inter prediction structure (ENC_PINTER)
*****************************************************************************/
typedef struct _ENC_PINTER ENC_PINTER;
struct _ENC_PINTER
{
int bit_depth;
/* temporary prediction buffer (only used for ME) */
pel pred_buf[MAX_CU_DIM];
/* reconstruction buffer */
pel rec_buf[N_C][MAX_CU_DIM];
// MVP: motion vector prediction
s16 mvp_scale[REFP_NUM][MAX_NUM_ACTIVE_REF_FRAME][MV_D];
s16 mv_scale[REFP_NUM][MAX_NUM_ACTIVE_REF_FRAME][MV_D];
// MVR: motion vector resolution (precision)
u8 curr_mvr;
int max_imv[MV_D];
int max_search_range;
CPMV affine_mvp_scale[REFP_NUM][MAX_NUM_ACTIVE_REF_FRAME][VER_NUM][MV_D];
CPMV affine_mv_scale[REFP_NUM][MAX_NUM_ACTIVE_REF_FRAME][VER_NUM][MV_D];
int best_mv_uni[REFP_NUM][MAX_NUM_ACTIVE_REF_FRAME][MV_D];
pel p_error[MAX_CU_DIM];
int i_gradient[2][MAX_CU_DIM];
s16 org_bi[MAX_CU_DIM];
s32 mot_bits[REFP_NUM];
u8 num_refp;
/* minimum clip value */
s16 min_mv_offset[MV_D];
/* maximum clip value */
s16 max_mv_offset[MV_D];
/* search range for int-pel */
s16 search_range_ipel[MV_D];
/* search range for sub-pel */
s16 search_range_spel[MV_D];
s8 (*search_pattern_hpel)[2];
u8 search_pattern_hpel_cnt;
s8 (*search_pattern_qpel)[2];
u8 search_pattern_qpel_cnt;
/* original (input) picture buffer */
COM_PIC *pic_org;
/* address of original (input) picture buffer, one pointer per component */
pel *Yuv_org[N_C];
/* stride of original (input) picture buffer, one per component */
int stride_org[N_C];
/* motion vector map */
s16 (*map_mv)[REFP_NUM][MV_D];
/* picture width in SCU unit */
int pic_width_in_scu;
/* QP for luma of current encoding CU */
int qp_y;
/* QP for chroma of current encoding CU */
int qp_u;
int qp_v;
u32 lambda_mv;
/* reference pictures */
COM_REFP (*refp)[REFP_NUM];
int slice_type;
/* search level for motion estimation */
int me_level;
int complexity;
void *pdata[4];
int *ndata[4];
/* current frame number */
int ptr;
/* gop size */
int gop_size;
/* ME function (Full-ME or Fast-ME): callback that performs motion estimation */
u32 (*fn_me)(ENC_PINTER *pi, int x, int y, int w, int h, int cu_x, int cu_y, int cu_stride, s8 *refi, int lidx, s16 mvp[MV_D], s16 mv[MV_D], int bi);
/* AFFINE ME function (Gradient-ME): callback that performs affine motion estimation */
u32 (*fn_affine_me)(ENC_PINTER *pi, int x, int y, int cu_width_log2, int cu_height_log2, s8 *refi, int lidx, CPMV mvp[VER_NUM][MV_D], CPMV mv[VER_NUM][MV_D], int bi, int vertex_num, int sub_w, int sub_h);
};
以下两个函数指针成员(fn_me 与 fn_affine_me)留作以后再议,目前还不清楚它们的具体实现细节。
/* ME function (Full-ME or Fast-ME): callback that performs motion estimation */
u32 (*fn_me)(ENC_PINTER *pi, int x, int y, int w, int h, int cu_x, int cu_y, int cu_stride, s8 *refi, int lidx, s16 mvp[MV_D], s16 mv[MV_D], int bi);
/* AFFINE ME function (Gradient-ME): callback that performs affine motion estimation */
u32 (*fn_affine_me)(ENC_PINTER *pi, int x, int y, int cu_width_log2, int cu_height_log2, s8 *refi, int lidx, CPMV mvp[VER_NUM][MV_D], CPMV mv[VER_NUM][MV_D], int bi, int vertex_num, int sub_w, int sub_h);
ENC_PARAM:编码器参数
/* encoder parameter (ENC_PARAM) */
typedef struct _ENC_PARAM
{
#if FIX116_PIC_SIZE
/* picture size of input sequence (width) */
int horizontal_size;
/* picture size of input sequence (height) */
int vertical_size;
/* picture size of pictures in DPB, the decoded picture buffer (width) */
int pic_width; // be a multiple of 8 (MINI_SIZE)
/* picture size of pictures in DPB, the decoded picture buffer (height) */
int pic_height; // be a multiple of 8 (MINI_SIZE)
#else
/* picture size of input sequence (width) */
int pic_width;
/* picture size of input sequence (height) */
int pic_height;
#endif
/* qp value for I- and P- slice */
int qp;
/* frame per second */
int fps;
/* I-frame period */
int i_period;
/* force I-frame -- NOTE(review): exact semantics are not visible here; confirm against the encoder */
int f_ifrm;
/* picture bit depth */
int bit_depth_input;
int bit_depth_internal;
/* use picture signature embedding */
int use_pic_sign;
int max_b_frames;
/* start bumping process if force_output is on */
int force_output;
int disable_hgop;
int gop_size;
int use_dqp;
#if USE_SLICE_DQP
int frame_qp_add; /* 10 bits*/
#endif
#if IPCM
int ipcm_enable_flag;
#endif
int amvr_enable_flag; // AMVR enable flag (adaptive motion vector resolution)
int affine_enable_flag; // affine enable flag (affine motion)
int smvd_enable_flag; // SMVD enable flag (symmetric MVD coding)
int use_deblock;
int num_of_hmvp_cand; // number of HMVP candidates (history-based motion vector prediction)
int ipf_flag;
#if TSCPM
int tscpm_enable_flag;
#endif
int umve_enable_flag;
#if EXT_AMVR_HMVP
int emvr_enable_flag;
#endif
#if DT_PARTITION
int dt_intra_enable_flag;
#endif
int wq_enable; // whether weighted quantization is enabled
int seq_wq_mode; // sequence-level weighted quantization mode
char seq_wq_user[2048];
int pic_wq_data_idx;
char pic_wq_user[2048];
int wq_param; // weighted quantization parameter
int wq_model;
char wq_param_detailed[256];
char wq_param_undetailed[256];
int sample_adaptive_offset_enable_flag; // whether sample adaptive offset is enabled
int adaptive_leveling_filter_enable_flag; // adaptive leveling filter enable flag
int secondary_transform_enable_flag; // secondary transform enable flag
u8 position_based_transform_enable_flag;
u8 library_picture_enable_flag;
u8 delta_qp_flag;
u8 chroma_format;
u8 encoding_precision;
#if HLS_RPL
COM_RPL rpls_l0[MAX_NUM_RPLS];
COM_RPL rpls_l1[MAX_NUM_RPLS];
int rpls_l0_cfg_num;
int rpls_l1_cfg_num;
#endif
#if PATCH
int patch_stable;
int cross_patch_loop_filter;
int patch_uniform;
int patch_ref_colocated;
int patch_width_in_lcu;
int patch_height_in_lcu;
int patch_columns;
int patch_rows;
int patch_column_width[64];
int patch_row_height[128];
#endif
#if LIBVC_ON
int qp_offset_libpic;
#endif
int sub_sample_ratio;
int frames_to_be_encoded;
u8 ctu_size; // size of the largest coding unit
u8 min_cu_size; // minimum coding unit size
u8 max_part_ratio;
u8 max_split_times; // maximum number of splits
u8 min_qt_size;
u8 max_bt_size;
u8 max_eqt_size;
u8 max_dt_size;
int qp_offset_cb;
int qp_offset_cr;
int qp_offset_adp;
int bit_depth;
} ENC_PARAM;
ENC_SBAC:稍后再议
/* SBAC encoder state (ENC_SBAC).
 * NOTE(review): this file does not document the individual fields. The names
 * suggest arithmetic-coder registers (range/code), pending-byte bookkeeping,
 * a context model set (ctx) and a bit counter used when is_bitcount is set --
 * confirm against the SBAC implementation before relying on this. */
typedef struct _ENC_SBAC
{
u32 range;
u32 code;
int left_bits;
u32 stacked_ff;
u32 pending_byte;
u32 is_pending_byte;
COM_SBAC_CTX ctx;
u32 bitcounter;
u8 is_bitcount;
} ENC_SBAC;
ENC_CORE:编码过程中使用的核心信息
/*****************************************************************************
* CORE information used for encoding process.
*
* The variables in this structure are very often used in encoding process.
*****************************************************************************/
typedef struct _ENC_CORE
{
/* mode decision structure */
COM_MODE mod_info_best; // best mode information
COM_MODE mod_info_curr; // current mode information
#if TB_SPLIT_EXT
COM_MODE mod_info_save;
//intra rdo copy the current best info directly into core->mod_info_best; need an internal pb_part for intra
int best_pb_part_intra; // best intra PB partition
int best_tb_part_intra; // best intra TB partition
#endif
/* coefficient buffer of current CU */
s16 coef[N_C][MAX_CU_DIM];
/* CU data for RDO */
ENC_CU_DATA cu_data_best[MAX_CU_DEPTH][MAX_CU_DEPTH];
ENC_CU_DATA cu_data_temp[MAX_CU_DEPTH][MAX_CU_DEPTH];
/* temporary coefficient buffer */
s16 ctmp[N_C][MAX_CU_DIM];
/* neighbor pixel buffer for intra prediction */
pel nb[N_C][N_REF][MAX_CU_SIZE * 3];
/* current encoding LCU number */
int lcu_num;
/* QP for luma of current encoding CU */
int qp_y;
/* QP for chroma of current encoding CU */
int qp_u;
int qp_v;
/* X address of current LCU */
int x_lcu;
/* Y address of current LCU */
int y_lcu;
/* X address of current CU in SCU unit */
int x_scu;
/* Y address of current CU in SCU unit */
int y_scu;
/* left pel position of current LCU */
int x_pel;
/* top pel position of current LCU */
int y_pel;
/* CU position in current LCU in SCU unit */
int cup;
/* CU depth */
int cud;
/* skip flag for MODE_INTER */
u8 skip_flag;
/* split flag for Qt_split_flag */
u8 split_flag;
/* platform specific data, if needed */
void *pf;
/* bitstream structure for RDO */
COM_BSW bs_temp;
/* SBAC structure for full RDO */
ENC_SBAC s_curr_best[MAX_CU_DEPTH][MAX_CU_DEPTH];
ENC_SBAC s_next_best[MAX_CU_DEPTH][MAX_CU_DEPTH];
ENC_SBAC s_temp_best;
ENC_SBAC s_temp_run;
ENC_SBAC s_temp_prev_comp_best;
ENC_SBAC s_temp_prev_comp_run;
#if TB_SPLIT_EXT
ENC_SBAC s_temp_pb_part_best;
#endif
ENC_SBAC s_curr_before_split[MAX_CU_DEPTH][MAX_CU_DEPTH];
ENC_BEF_DATA bef_data[MAX_CU_DEPTH][MAX_CU_DEPTH][MAX_CU_CNT_IN_LCU];
#if TR_SAVE_LOAD
u8 best_tb_part_hist;
#endif
#if TR_EARLY_TERMINATE
s64 dist_pred_luma;
#endif
ENC_SBAC s_sao_init, s_sao_cur_blk, s_sao_next_blk;
ENC_SBAC s_sao_cur_type, s_sao_next_type;
ENC_SBAC s_sao_cur_mergetype, s_sao_next_mergetype;
ENC_SBAC s_alf_cu_ctr;
ENC_SBAC s_alf_initial;
double cost_best;
u32 inter_satd;
s32 dist_cu;
s32 dist_cu_best; //dist of the best intra mode (note: only updated in intra coding now)
// for storing the update-to-date motion list
COM_MOTION motion_cands[ALLOWED_HMVP_NUM];
s8 cnt_hmvp_cands;
#if EXT_AMVR_HMVP
u8 skip_mvps_check;
#endif
} ENC_CORE;
main()函数
mode_coding_tree()
mode_coding_tree()参数整理
最开始的参数由来:
上面这个赋值出现在一帧图像的开始,也就是说,这个时候依然是以帧为单位的。
mode_coding_unit()
copy_cu_data()
调用场景:
copy_cu_data(&ctx->map_cu_data[core->lcu_num], &core->cu_data_best[ctx->info.log2_max_cuwh - 2][ctx->info.log2_max_cuwh - 2], 0, 0, ctx->info.log2_max_cuwh, ctx->info.log2_max_cuwh, ctx->info.log2_max_cuwh, 0, TREE_LC);
从调用方式来看,copy_cu_data()的作用是把最优(best)划分模式下得到的各项CU信息,从core->cu_data_best整体拷贝保存到ctx->map_cu_data中当前LCU(core->lcu_num)对应的位置。
/*
 * Copy the CU data of one block from src into dst.
 *
 * src is read as a compact block whose row stride equals the block width;
 * dst is written at offset (x, y) inside a larger area whose row stride is
 * (1 << log2_cus). Per-SCU arrays use the same geometry divided by the SCU
 * size (shift by MIN_CU_LOG2); chroma planes are addressed at half the luma
 * width and height.
 *
 *   dst, src        destination / source CU data
 *   x, y            position of the block inside dst, in luma samples
 *   cu_width_log2   log2 of the block width
 *   cu_height_log2  log2 of the block height
 *   log2_cus        log2 of the dst row stride
 *   cud             first depth index of split_mode to copy
 *   tree_status     TREE_C selects the chroma-only path (see below)
 *
 * Always returns COM_OK.
 *
 * The function asserts tree_status != TREE_C, so the chroma-only path can
 * only execute when assertions are compiled out.
 */
static int copy_cu_data(ENC_CU_DATA *dst, ENC_CU_DATA *src, int x, int y, int cu_width_log2, int cu_height_log2, int log2_cus, int cud, u8 tree_status)
{
    /* block geometry in luma samples */
    const int width_pel = 1 << cu_width_log2;
    const int height_pel = 1 << cu_height_log2;
    const int stride_pel = 1 << log2_cus;
    /* the same geometry expressed in SCU units */
    const int width_scu = 1 << (cu_width_log2 - MIN_CU_LOG2);
    const int height_scu = 1 << (cu_height_log2 - MIN_CU_LOG2);
    const int stride_scu = 1 << (log2_cus - MIN_CU_LOG2);
    const int col_scu = x >> MIN_CU_LOG2;
    const int row_scu = y >> MIN_CU_LOG2;
    /* chroma geometry: half of the luma dimensions */
    const int half_width = width_pel >> 1;
    const int half_height = height_pel >> 1;
    const int half_stride = stride_pel >> 1;
    int row, depth_idx, shape, comp;
    int off_dst, off_src, bytes;

    assert(tree_status != TREE_C);

    if (tree_status == TREE_C)
    {
        /* chroma-only: per-SCU chroma intra mode and chroma non-zero counts */
        for (row = 0; row < height_scu; row++)
        {
            off_dst = (row_scu + row) * stride_scu + col_scu;
            off_src = row * width_scu;

            bytes = width_scu * sizeof(s8);
            com_mcpy(dst->ipm[1] + off_dst, src->ipm[1] + off_src, bytes);

            bytes = width_scu * sizeof(int);
            assert(*(src->num_nz_coef[Y_C] + off_src) == 0);
            for (comp = U_C; comp < N_C; comp++)
            {
                com_mcpy(dst->num_nz_coef[comp] + off_dst, src->num_nz_coef[comp] + off_src, bytes);
            }
        }
    }
    else
    {
        /* per-SCU side information, one SCU row at a time */
        for (row = 0; row < height_scu; row++)
        {
            off_dst = (row_scu + row) * stride_scu + col_scu;
            off_src = row * width_scu;

            /* arrays copied with s8-sized elements */
            bytes = width_scu * sizeof(s8);
            for (depth_idx = cud; depth_idx < MAX_CU_DEPTH; depth_idx++)
            {
                for (shape = 0; shape < NUM_BLOCK_SHAPE; shape++)
                {
                    com_mcpy(dst->split_mode[depth_idx][shape] + off_dst, src->split_mode[depth_idx][shape] + off_src, bytes);
                }
            }
            com_mcpy(dst->pred_mode + off_dst, src->pred_mode + off_src, bytes);
            com_mcpy(dst->mpm[0] + off_dst, src->mpm[0] + off_src, bytes);
            com_mcpy(dst->mpm[1] + off_dst, src->mpm[1] + off_src, bytes);
            com_mcpy(dst->ipm[0] + off_dst, src->ipm[0] + off_src, bytes);
            com_mcpy(dst->ipm[1] + off_dst, src->ipm[1] + off_src, bytes);
            for (comp = 0; comp < 8; comp++)
            {
                com_mcpy(dst->mpm_ext[comp] + off_dst, src->mpm_ext[comp] + off_src, bytes);
            }
            com_mcpy(dst->affine_flag + off_dst, src->affine_flag + off_src, bytes);
#if SMVD
            com_mcpy(dst->smvd_flag + off_dst, src->smvd_flag + off_src, bytes);
#endif
            com_mcpy(dst->depth + off_dst, src->depth + off_src, bytes);

            /* arrays copied with u32-sized elements */
            bytes = width_scu * sizeof(u32);
            com_mcpy(dst->map_scu + off_dst, src->map_scu + off_src, bytes);
            com_mcpy(dst->map_cu_mode + off_dst, src->map_cu_mode + off_src, bytes);
#if TB_SPLIT_EXT
            com_mcpy(dst->map_pb_tb_part + off_dst, src->map_pb_tb_part + off_src, bytes);
#endif

            /* reference indices: REFP_NUM entries per SCU */
            bytes = width_scu * sizeof(u8) * REFP_NUM;
            com_mcpy(*(dst->refi + off_dst), *(src->refi + off_src), bytes);

            /* arrays copied with u8-sized elements */
            bytes = width_scu * sizeof(u8);
            com_mcpy(dst->umve_flag + off_dst, src->umve_flag + off_src, bytes);
            com_mcpy(dst->umve_idx + off_dst, src->umve_idx + off_src, bytes);
            com_mcpy(dst->skip_idx + off_dst, src->skip_idx + off_src, bytes);
            com_mcpy(dst->mvr_idx + off_dst, src->mvr_idx + off_src, bytes);
#if EXT_AMVR_HMVP
            bytes = width_scu * sizeof(u8);
            com_mcpy(dst->mvp_from_hmvp_flag + off_dst, src->mvp_from_hmvp_flag + off_src, bytes);
#endif
            bytes = width_scu * sizeof(u8);
            com_mcpy(dst->ipf_flag + off_dst, src->ipf_flag + off_src, bytes);

            /* motion vectors and differences: REFP_NUM x MV_D s16 values per SCU */
            bytes = width_scu * sizeof(s16) * REFP_NUM * MV_D;
            com_mcpy(dst->mv + off_dst, src->mv + off_src, bytes);
            com_mcpy(dst->mvd + off_dst, src->mvd + off_src, bytes);

            /* arrays copied with int-sized elements */
            bytes = width_scu * sizeof(int);
            for (comp = 0; comp < N_C; comp++)
            {
                com_mcpy(dst->num_nz_coef[comp] + off_dst, src->num_nz_coef[comp] + off_src, bytes);
            }
#if TB_SPLIT_EXT
            com_mcpy(dst->pb_part + off_dst, src->pb_part + off_src, bytes);
            com_mcpy(dst->tb_part + off_dst, src->tb_part + off_src, bytes);
#endif
        }

        /* luma coefficients and reconstruction, one sample row at a time */
        for (row = 0; row < height_pel; row++)
        {
            off_dst = (y + row) * stride_pel + x;
            off_src = row * width_pel;

            bytes = width_pel * sizeof(s16);
            com_mcpy(dst->coef[Y_C] + off_dst, src->coef[Y_C] + off_src, bytes);
            bytes = width_pel * sizeof(pel);
            com_mcpy(dst->reco[Y_C] + off_dst, src->reco[Y_C] + off_src, bytes);
        }
    }

    /* chroma coefficients and reconstruction: common tail of both paths */
    for (row = 0; row < half_height; row++)
    {
        off_dst = ((y >> 1) + row) * half_stride + (x >> 1);
        off_src = row * half_width;

        bytes = half_width * sizeof(s16);
        com_mcpy(dst->coef[U_C] + off_dst, src->coef[U_C] + off_src, bytes);
        com_mcpy(dst->coef[V_C] + off_dst, src->coef[V_C] + off_src, bytes);
        bytes = half_width * sizeof(pel);
        com_mcpy(dst->reco[U_C] + off_dst, src->reco[U_C] + off_src, bytes);
        com_mcpy(dst->reco[V_C] + off_dst, src->reco[V_C] + off_src, bytes);
    }

    return COM_OK;
}
diff_16b_32nx4n_simd()
调用场景:
enc_diff_16b(cu_width_log2, cu_height_log2, buf, pred[Y_C], stride, cu_width, cu_width, diff[Y_C]);
/*
 * Apply SSE_DIFF_16B_8PEL to every 8-sample group of a block of 16-bit
 * samples, walking it in tiles of 32 columns by 4 rows.
 *
 *   w, h      block width / height; (w >> 5) column tiles and (h >> 2) row
 *             groups are processed, so any remainder is not touched
 *   src1/src2 input buffers, accessed as s16
 *   s_src1, s_src2, s_diff
 *             row strides of src1, src2 and diff, in s16 elements
 *   diff      output buffer
 *
 * NOTE(review): SSE_DIFF_16B_8PEL is defined outside this snippet; it is
 * presumably an 8-lane 16-bit subtraction src1 - src2 stored to diff --
 * confirm against the macro definition.
 */
static void diff_16b_32nx4n_simd(int w, int h, void * src1, void * src2, int s_src1, int s_src2, int s_diff, s16 * diff)
{
    s16 *p1 = (s16 *)src1;
    s16 *p2 = (s16 *)src2;
    const int col_tiles = w >> 5;          /* number of 32-sample column tiles */
    const int row_groups = h >> 2;         /* number of 4-row groups */
    const int consumed = col_tiles << 5;   /* samples advanced per row group */
    int rg, ct, part;
    __m128i m01, m02, m03, m04, m05, m06, m07, m08, m09, m10, m11, m12;

    for (rg = 0; rg < row_groups; rg++)
    {
        for (ct = 0; ct < col_tiles; ct++)
        {
            /* one 32-sample tile = four 8-sample groups, each over 4 rows */
            for (part = 0; part < 4; part++)
            {
                SSE_DIFF_16B_8PEL(p1, p2, diff, m01, m02, m03);
                SSE_DIFF_16B_8PEL(p1+s_src1, p2+s_src2, diff+s_diff, m04, m05, m06);
                SSE_DIFF_16B_8PEL(p1+s_src1*2, p2+s_src2*2, diff+s_diff*2, m07, m08, m09);
                SSE_DIFF_16B_8PEL(p1+s_src1*3, p2+s_src2*3, diff+s_diff*3, m10, m11, m12);
                p1 += 8;
                p2 += 8;
                diff += 8;
            }
        }
        /* rewind to the start of the row and step down four rows */
        p1 += ((s_src1 << 2) - consumed);
        p2 += ((s_src2 << 2) - consumed);
        diff += ((s_diff << 2) - consumed);
    }
}