系列文章目录
【x264编码器】章节1——x264编码流程及基于x264的编码器demo
【x264编码器】章节2——x264的lookahead流程分析
【x265编码器】章节2——编码流程及基于x265的编码器demo
目录
1.帧间编码Analysis::compressInterCU_rd0_4
2.帧间merge模式代价计算checkMerge2Nx2N_rd0_4
3.获取merge模式候选列表getInterMergeCandidates
4.运动补偿Predict::motionCompensation
5.skip模式计算率失真代价Search::encodeResAndCalcRdSkipCU
6.merge模式计算率失真代价Search::encodeResAndCalcRdInterCU
7.帧间预测Analysis::checkInter_rd0_4
8.帧间预测Analysis::checkInter_rd0_4
9.帧间预测搜索Search::predInterSearch
10.运动估计MotionEstimate::motionEstimate
一、帧间预测流程
帧间预测流程总体流程如下图,黄色部分即为帧间预测流程的过程,下面流程图也包含了x265其他模块的流程。
x265完整的流程框架如下:
二、各模块代码分析
1.帧间编码Analysis::compressInterCU_rd0_4
流程上是:
1.先进行merge和skip模式的RD计算,如果不是skip模式最优,则继续往下执行;
2.跳到步骤1递归继续往下划分,一直划分到最小块8x8;
3.遍历2Nx2N、2NxN、Nx2N、2NxnD、2NxnU、nRx2N、nLx2N、帧间的帧内预测模式,其中矩形和非对称划分,默认配置下,x265不需要遍历;
4.返回最优模式;
代码分析如下:
/* Recursive inter-CU mode decision for rd-levels 0..4.
 * Evaluates merge/skip early-outs, recursively compresses the four split
 * sub-CUs, then tries 2Nx2N / rect / AMP inter and intra-in-inter modes at
 * the current depth, choosing the best by SA8D or RD cost depending on
 * rdLevel.  Returns the SplitData (reference masks / MV costs) the parent
 * CU should use to limit its own search. */
SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    // VBV-affected CTUs with a re-derived QP fall back to the rd 5/6 path
    if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
        return compressInterCU_rd5_6(parentCTU, cuGeom, qp);

    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];

    if (m_param->searchMethod == X265_SEA)
    {
        /* SEA motion search: for each prediction direction (uni for P, bi for B)
         * and each reference index, point m_modeDepth[depth].fencYuv.m_integral
         * at the reference frame's integral-image planes, offset to this CU. */
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
        for (int list = 0; list < numPredDir; list++)
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
    }

    // Reconstructed picture the best mode's recon is copied into at the end
    PicYuv& reconPic = *m_frame->m_reconPic;
    SplitData splitCUData;

    // Decide whether this CU is analysed here or offloaded (AVC analysis reuse)
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);

    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
    {
        // md.bestMode accumulates the best mode found at this depth
        md.bestMode = NULL;
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom); // minimum depth below which mode analysis is skipped (top-skip heuristic)
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
        bool skipModes = false;     /* Skip any remaining mode analyses at current depth */
        bool skipRecursion = false; /* Skip recursion */
        bool splitIntra = true;
        bool skipRectAmp = false;
        bool chooseMerge = false;
        bool bCtuInfoCheck = false;
        int sameContentRef = 0;

        if (m_evaluateInter)
        {
            if (m_refineLevel == 2)
            {
                // Refinement: reuse the prior decision — skip modes if it was SKIP,
                // skip rect/AMP if it was 2Nx2N
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
                    skipModes = true;
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
                    skipRectAmp = true;
            }
            mightSplit &= false;
            minDepth = depth;
        }

        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);

        // Per-quadrant split data gathered from the four sub-CUs (Step 2)
        SplitData splitData[4];
        splitData[0].initSplitCUData();
        splitData[1].initSplitCUData();
        splitData[2].initSplitCUData();
        splitData[3].initSplitCUData();

        // avoid uninitialize value in below reference
        if (m_param->limitModes)
        {
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
            md.pred[PRED_2Nx2N].sa8dCost = 0;
        }

        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
        {
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
            {
                // CTU-info says this CU splits further: force recursion only
                mightNotSplit &= bDecidedDepth;
                bCtuInfoCheck = skipRecursion = false;
                skipModes = true;
            }
            else if (mightNotSplit && bDecidedDepth)
            {
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
                {
                    // Depth decided by CTU info: evaluate merge/skip here, no recursion
                    bCtuInfoCheck = skipRecursion = true;
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                    if (!sameContentRef)
                    {
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
                        {
                            // Changed content: lower QP by ~4% and refresh lambda
                            qp -= int32_t(0.04 * qp);
                            setLambdaFromQP(parentCTU, qp);
                        }
                        if (m_param->bCTUInfo & 4)
                            skipModes = false;
                    }
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
                    {
                        if (m_param->rdLevel)
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
                            skipModes = md.bestMode && true;
                    }
                }
                else
                {
                    // No extra CTU info: plain 2Nx2N merge/skip analysis
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); // evaluate 2Nx2N merge for PRED_SKIP / PRED_MERGE
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
                }
                mightSplit &= !bDecidedDepth;
            }
        }

        // Analysis-load reuse (levels 2..9): reuse depth/mode decisions from a prior pass
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
        {
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
            {
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
                {
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
                }
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
                {
                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA && m_reuseModes[cuGeom.absPartIdx] != 4)
                    {
                        skipRectAmp = true && !!md.bestMode;
                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
                    }
                }
            }
        }

        // Multi-pass refine: same reuse logic driven by first-pass rate-control stats
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
        {
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
            {
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
                {
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
                }
            }
        }

        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
            /* TODO: Re-evaluate if analysis load/save still works */
        {
            /* Compute Merge Cost */
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            if (m_param->rdLevel)
                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
                    && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
        }

        /* Decide whether recursion can be skipped: needs a best mode, recursion-skip
         * enabled, no CTU-info check, and not the AVC refine special case */
        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
        {
            skipRecursion = md.bestMode->cu.isSkipped(0);
            if (mightSplit && !skipRecursion)
            {
                /* RD-cost based recursion skip: depth history check, plus a
                 * complexity check for HD content at rdLevel 2 */
                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
                {
                    if (depth)
                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
                        skipRecursion = complexityCheckCU(*md.bestMode);
                }
                /* Edge-based recursion skip: only for the two largest CU sizes */
                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
                {
                    skipRecursion = complexityCheckCU(*md.bestMode);
                }
            }
        }

        // AVC analysis reuse level 7 on small CUs: trust the reused decision, stop recursing
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
            skipRecursion = true;

        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
        if (mightSplit && !skipRecursion)
        {
            // CTU-info "changed content" path raised lambda above; undo the QP delta here
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
                qp = int((1 / 0.96) * qp + 0.5);
            // Set up the split prediction mode and its CU
            Mode* splitPred = &md.pred[PRED_SPLIT];
            splitPred->initCosts();
            CUData* splitCU = &splitPred->cu;
            splitCU->initSubCU(parentCTU, cuGeom, qp);

            // Next recursion depth and its mode-depth context
            uint32_t nextDepth = depth + 1;
            ModeDepth& nd = m_modeDepth[nextDepth];
            invalidateContexts(nextDepth);
            Entropy* nextContext = &m_rqt[depth].cur;
            int nextQP = qp; // may be re-derived per sub-CU below
            splitIntra = false;

            // Recursively compress each of the four sub-CUs and gather their data
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
            {
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
                if (childGeom.flags & CUGeom::PRESENT)
                {
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                    m_rqt[nextDepth].cur.load(*nextContext);

                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);

                    // Save best CU and pred data for this sub CU; track whether any
                    // sub-CU chose intra (enables intra evaluation at this depth)
                    splitIntra |= nd.bestMode->cu.isIntra(0);
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                    splitPred->addSubCosts(*nd.bestMode);

                    // rdLevel > 0 has recon pixels; rdLevel 0 only has prediction
                    if (m_param->rdLevel)
                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                    else
                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                    if (m_param->rdLevel > 1)
                        nextContext = &nd.bestMode->contexts;
                }
                else
                    // Sub-CU lies outside the picture: mark it empty
                    splitCU->setEmptyPart(childGeom, subPartIdx);
            }
            // Store the final entropy context into the split prediction
            nextContext->store(splitPred->contexts);

            // Account for the split flag, or finalize the split cost by rd level
            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else if (m_param->rdLevel > 1)
                updateModeCost(*splitPred);
            else
                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
        }

        /* If analysis mode is simple do not Evaluate other modes */
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
        {
            if (m_slice->m_sliceType == P_SLICE)
            {
                if (m_checkMergeAndSkipOnly[0])
                    skipModes = true;
            }
            else
            {
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
                    skipModes = true;
            }
        }

        /* Split CUs
         *   0  1
         *   2  3 */
        // Union of the four sub-CUs' motion-reference masks computed in Step 2
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;

        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
        {
            // Refresh lambda from QP when DQP is active at this depth
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
                setLambdaFromQP(parentCTU, qp);

            if (!skipModes)
            {
                // refMasks limit the reference pictures searched per partition
                uint32_t refMasks[2];
                refMasks[0] = allSplitRefs;
                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);

                // CU-level reference limiting: propagate 2Nx2N's best ref to all quadrants
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
                {
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
                    uint32_t refMask = cu.getBestRefIdx(0);
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
                }

                if (m_slice->m_sliceType == B_SLICE)
                {
                    // B slices additionally evaluate bidirectional 2Nx2N
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                }

                // Best inter candidate so far; starts at 2Nx2N
                Mode *bestInter = &md.pred[PRED_2Nx2N];

                if (!skipRectAmp)
                {
                    if (m_param->bEnableRectInter)
                    {
                        // Sum of the four sub-CUs' SA8D costs gates rect evaluation
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
                        uint32_t threshold_2NxN, threshold_Nx2N;

                        // MV-cost thresholds per slice type (B averages both lists)
                        if (m_slice->m_sliceType == P_SLICE)
                        {
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
                        }
                        else
                        {
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
                        }

                        // Try 2NxN first when its threshold is the tighter one
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                        {
                            // Reference masks from the matching sub-CU halves
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_2NxN]; // 2NxN wins on SA8D
                        }

                        // Nx2N, same gating
                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_Nx2N];
                        }

                        // 2NxN when it was not tried first
                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_2NxN];
                        }
                    }

                    // AMP (asymmetric) partitions are allowed at this depth
                    if (m_slice->m_sps->maxAMPDepth > depth)
                    {
                        // SA8D cost of the four sub-CUs, used as the AMP gate
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;

                        // Per-slice-type MV-cost thresholds for each AMP shape
                        if (m_slice->m_sliceType == P_SLICE)
                        {
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
                        }
                        else
                        {
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                        }

                        /* The best partition so far decides which AMP orientations
                         * are worth trying */
                        bool bHor = false, bVer = false;
                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                            bHor = true;  // 2NxN best: try horizontal AMP shapes
                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                            bVer = true;  // Nx2N best: try vertical AMP shapes
                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                                 md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                        {
                            // 2Nx2N best but residual remains: try both orientations
                            bHor = true;
                            bVer = true;
                        }

                        if (bHor)
                        {
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnD];
                            }

                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnU];
                            }

                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnD];
                            }
                        }

                        if (bVer)
                        {
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nRx2N];
                            }

                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nLx2N];
                            }

                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nRx2N];
                            }
                        }
                    }
                }

                /* Whether intra should be tried here: allowed in B frames only when
                 * bIntraInBFrames is set, never at the 64x64 size, and not when the
                 * CTU-info check already handled this CU */
                bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);

                if (m_param->rdLevel >= 3)
                {
                    /* Calculate RD cost of best inter option */
                    // Chroma MC is still needed here only when SA8D did not include chroma
                    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                    {
                        uint32_t numPU = bestInter->cu.getNumPartInter(0);
                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                        {
                            // Motion-compensate every PU of the best inter mode
                            PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                            motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                        }
                    }

                    // With chooseMerge set (analysis reuse) the merge decision stands as-is
                    if (!chooseMerge)
                    {
                        // Full residual coding + RD cost of the best inter candidate
                        encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                        checkBestMode(*bestInter, depth); // update md.bestMode if better

                        /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                        if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                            md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                        {
                            // MC each PU if needed, then RD-cost BIDIR and compare
                            uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
                            if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
                                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                                {
                                    PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
                                    motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
                                }
                            encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                            checkBestMode(md.pred[PRED_BIDIR], depth);
                        }
                    }

                    /* Try intra when allowed and the best mode still has residual,
                     * or when no viable inter mode was found at all */
                    if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                        md.bestMode->sa8dCost == MAX_INT64)
                    {
                        // With limited references, only try intra if a sub-CU chose intra
                        if (!m_param->limitReferences || splitIntra)
                        {
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                            checkBestMode(md.pred[PRED_INTRA], depth);
                        }
                        else
                        {
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                        }
                    }
                }
                else
                {
                    /* SA8D choice between merge/skip, inter, bidir, and intra */
                    if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = bestInter;

                    if (m_slice->m_sliceType == B_SLICE &&
                        md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_BIDIR];

                    if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                    {
                        if (!m_param->limitReferences || splitIntra)
                        {
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                            if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                                md.bestMode = &md.pred[PRED_INTRA];
                        }
                        else
                        {
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                        }
                    }

                    /* finally code the best mode selected by SA8D costs:
                     * RD level 2 - fully encode the best mode
                     * RD level 1 - generate recon pixels
                     * RD level 0 - generate chroma prediction */
                    if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                    {
                        /* prediction already generated for this CU, and if rd level
                         * is not 0, it is already fully encoded */
                    }
                    else if (md.bestMode->cu.isInter(0))
                    {
                        uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                        if (m_csp != X265_CSP_I400)
                        {
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                            {
                                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                            }
                        }
                        if (m_param->rdLevel == 2)
                            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                        else if (m_param->rdLevel == 1)
                        {
                            /* generate recon pixels with no rate distortion considerations */
                            CUData& cu = md.bestMode->cu;
                            uint32_t tuDepthRange[2];
                            cu.getInterTUQtDepthRange(tuDepthRange, 0);
                            m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
                            residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                            if (cu.getQtRootCbf(0))
                                md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
                            else
                            {
                                // No residual: recon == prediction; 2Nx2N merge becomes SKIP
                                md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                                if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                    cu.setPredModeSubParts(MODE_SKIP);
                            }
                        }
                    }
                    else
                    {
                        // Best mode is intra
                        if (m_param->rdLevel == 2)
                            encodeIntraInInter(*md.bestMode, cuGeom);
                        else if (m_param->rdLevel == 1)
                        {
                            /* generate recon pixels with no rate distortion considerations */
                            CUData& cu = md.bestMode->cu;
                            uint32_t tuDepthRange[2];
                            cu.getIntraTUQtDepthRange(tuDepthRange, 0);
                            residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                            if (m_csp != X265_CSP_I400)
                            {
                                getBestIntraModeChroma(*md.bestMode, cuGeom);
                                residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                            }
                            md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
                        }
                    }
                }
            } // !earlyskip

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }

        // Compare the split result against the best non-split mode
        if (mightSplit && !skipRecursion)
        {
            Mode* splitPred = &md.pred[PRED_SPLIT];
            if (!md.bestMode)
                md.bestMode = splitPred;
            else if (m_param->rdLevel > 1)
                checkBestMode(*splitPred, cuGeom.depth); // compare by RD cost
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
                md.bestMode = splitPred;

            checkDQPForSplitPred(*md.bestMode, cuGeom);
        }

        /* determine which motion references the parent CU should search */
        splitCUData.initSplitCUData();

        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
        {
            // If split won, pass up the union of sub-CU references
            if (md.bestMode == &md.pred[PRED_SPLIT])
                splitCUData.splitRefs = allSplitRefs;
            else
            {
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
                uint32_t numPU = cu.getNumPartInter(0);
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
            }
        }

        if (m_param->limitModes)
        {
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
        }

        /* Best mode ended as skipped: update this CTU's rate-control skip-cost
         * statistics (running average per depth) */
        if (mightNotSplit && md.bestMode->cu.isSkipped(0))
        {
            FrameData& curEncData = *m_frame->m_encData;
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
            uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
            cuStat.count[depth] += 1;
            cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
        }

        /* Copy best data to encData CTU and recon */
        md.bestMode->cu.copyToPic(depth);
        if (m_param->rdLevel)
            md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);

        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
        {
            if (mightNotSplit)
            {
                // Record the maximum TU depth used, for neighbour-based TU limiting
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
                int8_t maxTUDepth = -1;
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
            }
        }
    }
    else
    {
        // AVC-analysis offload path for small CUs: only QP refinement plus
        // the same splitRefs / mvCost bookkeeping for the parent
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
        {
            qprdRefine(parentCTU, cuGeom, qp, qp);

            SplitData splitData[4];
            splitData[0].initSplitCUData();
            splitData[1].initSplitCUData();
            splitData[2].initSplitCUData();
            splitData[3].initSplitCUData();

            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;

            splitCUData.initSplitCUData();

            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
            {
                if (md.bestMode == &md.pred[PRED_SPLIT])
                    splitCUData.splitRefs = allSplitRefs;
                else
                {
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
                    uint32_t numPU = cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
                }
            }

            if (m_param->limitModes)
            {
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
            }
        }
    }

    return splitCUData;
}
2.帧间merge模式代价计算checkMerge2Nx2N_rd0_4
流程上是:
1.选取3个候选项;(x265默认配置参数,常规应该为5个)
2.遍历进行运动补偿(仅做亮度),根据sa8d计算RD,选取最优的候选项;
3.对最优候选项追加色度的运动补偿;
4.计算skip模式的RD,此时会涉及计算残差、熵编码、计算bit数、得到最后的RD值;
5.类似上一步,计算merge模式的RD值;
6.对比skip模式和merge模式,取最优的模式并返回。代码分析如下:
//如果找到有效的合并候选项,则设置md.bestMode,否则为NULL
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{ //获取 cuGeom 的深度
uint32_t depth = cuGeom.depth;
ModeDepth& md = m_modeDepth[depth];
Yuv *fencYuv = &md.fencYuv;
//初始化两个 Mode 实例,命名为 tempPred 和 bestPred,并将它们分别指向 merge 和 skip
/* Note that these two Mode instances are named MERGE and SKIP but they may
* hold the reverse when the function returns. We toggle between the two modes */
Mode* tempPred = &merge;
Mode* bestPred = &skip;
X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
tempPred->initCosts();
tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
tempPred->cu.setPredModeSubParts(MODE_INTER);
tempPred->cu.m_mergeFlag[0] = true;
bestPred->initCosts();
bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
bestPred->cu.setPredModeSubParts(MODE_INTER);
bestPred->cu.m_mergeFlag[0] = true;
//用于存储merge候选项的运动矢量
MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
uint8_t candDir[MRG_MAX_NUM_CANDS];//用于存储合并候选项的方向
uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
PredictionUnit pu(merge.cu, cuGeom, 0);
bestPred->sa8dCost = MAX_INT64;
int bestSadCand = -1;
int sizeIdx = cuGeom.log2CUSize - 2;
int safeX, maxSafeMv;
if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
{
safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
}
for (uint32_t i = 0; i < numMergeCand; ++i)
{ //启用了并行处理(m_bFrameParallel),则进行并行切片的边界检查
if (m_bFrameParallel)
{
// Parallel slices bound check
if (m_param->maxSlices > 1)
{
// NOTE: First row in slice can't negative
if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
continue;
// Last row in slice can't reference beyond bound since it is another slice area
// TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
continue;
}
if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
continue;
}
if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
candMvField[i][0].mv.x > maxSafeMv)
// skip merge candidates which reference beyond safe reference area
continue;
//将当前候选项的相关信息赋值给 tempPred->cu 结构体的相应成员变量
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
tempPred->cu.m_interDir[0] = candDir[i];
tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;//执行运动补偿,根据候选项的运动矢量对 tempPred->predYuv 进行预测
motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
//当前候选项的比特数,当前候选项的失真
tempPred->sa8dBits = getTUBits(i, numMergeCand);
tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
{ //计算色度平面的 SA8D 失真
tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
}//当前候选项的 RD 代价,将失真值和比特数作为参数
tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
//如果 tempPred->sa8dCost 小于 bestPred->sa8dCost,则更新 bestSadCand 的值为当前候选项的索引 i,并交换 tempPred 和 bestPred
if (tempPred->sa8dCost < bestPred->sa8dCost)
{ //通过上述循环迭代,找到了 RD 代价最小的候选项,并将其索引存储在 bestSadCand 中
bestSadCand = i;
std::swap(tempPred, bestPred);
}
}
//首先,代码检查bestSadCand是否小于0,如果是,则强制选择帧内编码或帧间编码。如果是帧内编码,直接返回
/* force mode decision to take inter or intra */
if (bestSadCand < 0)
return;
//接下来,对于选择的最佳模式,计算其色度通道的运动补偿。这一步是为了对色度通道进行运动补偿,以提高编码效率
/* calculate the motion compensation for chroma for the best mode selected */
if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
//如果设置了rdLevel(率失真优化级别),默认为3
if (m_param->rdLevel)
{ // 检查是否设置了无损编码(bLossless),如果是,则将bestPred的rdCost设置为最大值
if (m_param->bLossless)
bestPred->rdCost = MAX_INT64;
else//对bestPred进行编码,包括编码残差并计算率失真(RdSkip)的代价
encodeResAndCalcRdSkipCU(*bestPred);
/* Encode with residual 使用残差进行编码*///将tempPred的一些参数设置为与bestPred相同
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
tempPred->sa8dCost = bestPred->sa8dCost;
tempPred->sa8dBits = bestPred->sa8dBits;
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
//使用encodeResAndCalcRdInterCU函数对tempPred进行编码和率失真计算
encodeResAndCalcRdInterCU(*tempPred, cuGeom);
//根据编码结果的率失真代价(rdCost)比较,选择最佳模式
md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
}
else
md.bestMode = bestPred;
//将最佳模式的运动矢量和参考图像索引广播给其他部分
/* broadcast sets of MV field data */
md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
checkDQP(*md.bestMode, cuGeom);//调用checkDQP函数对最佳模式进行DQP(差分量化参数)检查
}
3.获取merge模式候选列表getInterMergeCandidates
函数功能是获取merge模式候选项,原理可以参考HEVC视频编解码标准简介中的merge模式流程,区别在于,x265默认配置下,只会获取3个候选项,而不是常规的5个候选项;
/* Derive the merge-mode candidate list for one PU (HEVC merge derivation):
 * spatial candidates A1 (left), B1 (above), B0 (above-right), A0 (below-left),
 * B2 (above-left), then the temporal co-located candidate, then combined
 * bi-predictive candidates (B slices only), and finally zero-MV padding.
 * Returns the number of candidates written into candMvField/candDir
 * (returns early with m_maxNumMergeCand as soon as the list is full). */
uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*candMvField)[2], uint8_t* candDir) const
{
uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
const bool isInterB = m_slice->isInterB();// whether the current slice is an inter-B slice
// maximum number of merge candidates allowed for this slice
const uint32_t maxNumMergeCand = m_slice->m_maxNumMergeCand;
// initialize all candidates: zero motion vectors, invalid reference indices
for (uint32_t i = 0; i < maxNumMergeCand; ++i)
{
candMvField[i][0].mv = 0;
candMvField[i][1].mv = 0;
candMvField[i][0].refIdx = REF_NOT_VALID;
candMvField[i][1].refIdx = REF_NOT_VALID;
}
// compute the upper-left pixel position and the size of the current PU
/* calculate the location of upper-left corner pixel and size of the current PU */
int xP, yP, nPSW, nPSH;
int cuSize = 1 << m_log2CUSize[0];
int partMode = m_partSize[0];
// look up the predefined partition table to get the PU geometry from the partition mode and puIdx
int tmp = partTable[partMode][puIdx][0];
nPSW = ((tmp >> 4) * cuSize) >> 2;// nPSW / nPSH: width and height of the current PU
nPSH = ((tmp & 0xF) * cuSize) >> 2;
// xP / yP: x and y coordinates of the PU's upper-left pixel
tmp = partTable[partMode][puIdx][1];
xP = ((tmp >> 4) * cuSize) >> 2;
yP = ((tmp & 0xF) * cuSize) >> 2;
// number of candidates gathered so far
uint32_t count = 0;
// derive the below-left partition index of this PU
uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx);
PartSize curPS = (PartSize)m_partSize[absPartIdx];
// left (A1) candidate
uint32_t leftPartIdx = 0;
const CUData* cuLeft = getPULeft(leftPartIdx, partIdxLB);
bool isAvailableA1 = cuLeft &&// left PU must exist, lie in a different MER, not be ruled out by the partition shape, and be inter coded
cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) &&
cuLeft->isInter(leftPartIdx);
if (isAvailableA1)// copy the left PU's inter direction and motion vectors into the list
{
// get Inter Dir
candDir[count] = cuLeft->m_interDir[leftPartIdx];
// get Mv from Left
cuLeft->getMvField(cuLeft, leftPartIdx, 0, candMvField[count][0]);
if (isInterB)
cuLeft->getMvField(cuLeft, leftPartIdx, 1, candMvField[count][1]);
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
// derive the left-top and right-top partition indices for this PU
deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT);
// above (B1) candidate
uint32_t abovePartIdx = 0;
const CUData* cuAbove = getPUAbove(abovePartIdx, partIdxRT);
bool isAvailableB1 = cuAbove &&
cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) &&
cuAbove->isInter(abovePartIdx);
if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx)))
{ // copy the above PU's inter direction and motion vectors (pruned against A1)
// get Inter Dir
candDir[count] = cuAbove->m_interDir[abovePartIdx];
// get Mv from Above
cuAbove->getMvField(cuAbove, abovePartIdx, 0, candMvField[count][0]);
if (isInterB)
cuAbove->getMvField(cuAbove, abovePartIdx, 1, candMvField[count][1]);
// stop once the candidate list is full
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
// above-right (B0) candidate
uint32_t aboveRightPartIdx = 0;
const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT);
bool isAvailableB0 = cuAboveRight &&
cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) &&
cuAboveRight->isInter(aboveRightPartIdx);
if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx)))
{ // copy the above-right PU's inter direction and motion vectors (pruned against B1)
// get Inter Dir
candDir[count] = cuAboveRight->m_interDir[aboveRightPartIdx];
// get Mv from Above Right
cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, candMvField[count][0]);
if (isInterB)
cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, candMvField[count][1]);
// stop once the candidate list is full
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
// below-left (A0) candidate
uint32_t leftBottomPartIdx = 0;
const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB);
bool isAvailableA0 = cuLeftBottom &&
cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) &&
cuLeftBottom->isInter(leftBottomPartIdx);
if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx)))
{ // copy the below-left PU's inter direction and motion vectors (pruned against A1)
// get Inter Dir
candDir[count] = cuLeftBottom->m_interDir[leftBottomPartIdx];
// get Mv from Below Left
cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, candMvField[count][0]);
if (isInterB)
cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, candMvField[count][1]);
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
// above-left (B2) candidate, only considered when fewer than 4 spatial candidates were found
if (count < 4)
{
uint32_t aboveLeftPartIdx = 0;
const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr);
bool isAvailableB2 = cuAboveLeft &&
cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) &&
cuAboveLeft->isInter(aboveLeftPartIdx);
if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx))
&& (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx)))
{ // copy the above-left PU's inter direction and motion vectors (pruned against A1 and B1)
// get Inter Dir
candDir[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx];
// get Mv from Above Left
cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, candMvField[count][0]);
if (isInterB)
cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, candMvField[count][1]);
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
}// temporal (co-located) merge candidate, if temporal MVP is enabled in the SPS
if (m_slice->m_sps->bTemporalMVPEnabled)
{ // derive the right-bottom partition index of this PU
uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
MV colmv;
int ctuIdx = -1;
// image boundary check: the right-bottom co-located position must lie inside the picture
if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples &&
m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
{ // locate the co-located right-bottom neighbour relative to the CTU grid
uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
uint32_t numUnits = s_numPartInCUSize;
bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTU
bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row of CTU
// neighbour lies inside the current CTU: use the unit to its lower-right
if (bNotLastCol && bNotLastRow)
{
absPartAddr = g_rasterToZscan[absPartIdxRB + RASTER_SIZE + 1];
ctuIdx = m_cuAddr;
}
else if (bNotLastCol)
absPartAddr = g_rasterToZscan[(absPartIdxRB + 1) & (numUnits - 1)];
else if (bNotLastRow)
{ // last column but not last row: the neighbour falls into the CTU to the right
absPartAddr = g_rasterToZscan[absPartIdxRB + RASTER_SIZE - numUnits + 1];
ctuIdx = m_cuAddr + 1;
}
else // is the right bottom corner of CTU: no usable right-bottom neighbour, reset absPartAddr
absPartAddr = 0;
}
// number of reference lists to probe (2 for B slices, 1 otherwise)
int maxList = isInterB ? 2 : 1;
int dir = 0, refIdx = 0;
for (int list = 0; list < maxList; list++)
{ // try the right-bottom co-located MV first; fall back to the PU-center position
bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, list, ctuIdx, absPartAddr);
if (!bExistMV)
{
uint32_t partIdxCenter = deriveCenterIdx(puIdx);
bExistMV = getColMVP(colmv, refIdx, list, m_cuAddr, partIdxCenter);
}
if (bExistMV)
{ // store the co-located MV and refIdx; mark this list as valid in 'dir'
dir |= (1 << list);
candMvField[count][list].mv = colmv;
candMvField[count][list].refIdx = refIdx;
}
}
if (dir != 0)
{
candDir[count] = (uint8_t)dir;
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
}
if (isInterB)
{ // combined bi-predictive candidates: pair L0/L1 MVs of existing candidates in a fixed priority order
const uint32_t cutoff = count * (count - 1);
uint32_t priorityList0 = 0xEDC984; // { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }
uint32_t priorityList1 = 0xB73621; // { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }
for (uint32_t idx = 0; idx < cutoff; idx++, priorityList0 >>= 2, priorityList1 >>= 2)
{
int i = priorityList0 & 3;
int j = priorityList1 & 3;
if ((candDir[i] & 0x1) && (candDir[j] & 0x2))
{ // candidate i must carry an L0 MV and candidate j an L1 MV
// get Mv from cand[i] and cand[j]
int refIdxL0 = candMvField[i][0].refIdx;
int refIdxL1 = candMvField[j][1].refIdx;
int refPOCL0 = m_slice->m_refPOCList[0][refIdxL0];
int refPOCL1 = m_slice->m_refPOCList[1][refIdxL1];
// skip degenerate pairs whose two halves reference the same POC with identical motion
if (!(refPOCL0 == refPOCL1 && candMvField[i][0].mv == candMvField[j][1].mv))
{
candMvField[count][0].mv = candMvField[i][0].mv;
candMvField[count][0].refIdx = refIdxL0;
candMvField[count][1].mv = candMvField[j][1].mv;
candMvField[count][1].refIdx = refIdxL1;
candDir[count] = 3;
if (++count == maxNumMergeCand)
return maxNumMergeCand;
}
}
}
}
int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx[0], m_slice->m_numRefIdx[1]) : m_slice->m_numRefIdx[0];
int r = 0;
int refcnt = 0;
while (count < maxNumMergeCand)
{ // pad the list with zero-MV candidates, cycling through the reference indices
candDir[count] = 1;
candMvField[count][0].mv.word = 0;
candMvField[count][0].refIdx = r;
if (isInterB)
{
candDir[count] = 3;
candMvField[count][1].mv.word = 0;
candMvField[count][1].refIdx = r;
}
count++;
if (refcnt == numRefIdx - 1)
r = 0;
else
{
++r;
++refcnt;
}
}
return count;
}
4.运动补偿Predict::motionCompensation
运动补偿主要是根据MV运动矢量指向的参考块,构建预测块的流程,对应代码分析如下;
/* Motion compensation: build the PU's prediction samples (predYuv) from the
 * reference picture(s) addressed by the PU's motion vectors, applying
 * weighted prediction when the PPS/slice enables it.
 * bLuma / bChroma select which planes are produced. */
void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
{
int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx];
int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx];
if (cu.m_slice->isInterP())
{
/* P Slice */
WeightValues wv0[3];
// the L0 reference index must be valid for a P slice
X265_CHECK(refIdx0 >= 0, "invalid P refidx\n");
X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n");
const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0];
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
cu.clipMv(mv0);
// weighted prediction: derive per-plane weight values from the slice weight table
if (cu.m_slice->m_pps->bUseWeightPred && wp0->wtPresent)
{
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
{ // per-plane weight w, offset, right-shift and rounding constant
wv0[plane].w = wp0[plane].inputWeight;
wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
wv0[plane].shift = wp0[plane].log2WeightDenom;
wv0[plane].round = wp0[plane].log2WeightDenom >= 1 ? 1 << (wp0[plane].log2WeightDenom - 1) : 0;
}
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)// weighted luma: interpolate into a 16-bit intermediate buffer first
predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
if (bChroma)// weighted chroma: likewise into the 16-bit intermediate buffer
predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
// apply the uni-directional weights and write the result into predYuv
addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{ // no weighting: interpolate directly at pixel precision
if (bLuma)// plain luma motion compensation
predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
if (bChroma)
predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
}
}
else
{
/* B Slice */
WeightValues wv0[3], wv1[3];
const WeightParam *pwp0, *pwp1;
X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n");
X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n");
if (cu.m_slice->m_pps->bUseWeightedBiPred)
{
pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
{
/* biprediction weighting */
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
{
// NOTE(review): '.o' here vs '.offset' in the uni-prediction paths — looks like a
// transcription inconsistency of the same WeightValues field; confirm against predict.h
wv0[plane].w = pwp0[plane].inputWeight;
wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
wv0[plane].shift = pwp0[plane].log2WeightDenom;
wv0[plane].round = 1 << pwp0[plane].log2WeightDenom;
wv1[plane].w = pwp1[plane].inputWeight;
wv1[plane].o = pwp1[plane].inputOffset * (1 << (X265_DEPTH - 8));
wv1[plane].shift = wv0[plane].shift;
wv1[plane].round = wv0[plane].round;
}
}
else
{
/* uniprediction weighting, always outputs to wv0 */
const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
{
wv0[plane].w = pwp[plane].inputWeight;
wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
wv0[plane].shift = pwp[plane].log2WeightDenom;
wv0[plane].round = pwp[plane].log2WeightDenom >= 1 ? 1 << (pwp[plane].log2WeightDenom - 1) : 0;
}
}
}
else
pwp0 = pwp1 = NULL;
// true bi-directional prediction: both reference lists carry a valid index
if (refIdx0 >= 0 && refIdx1 >= 0)
{
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
cu.clipMv(mv0);
cu.clipMv(mv1);
if (bLuma)
{
predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
}
if (bChroma)
{
predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
}
// weighted bi-prediction when weights are present, otherwise a plain average of the two lists
if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
else
predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
}
else if (refIdx0 >= 0)
{
// uniprediction from list 0
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
cu.clipMv(mv0);
if (pwp0 && pwp0->wtPresent)
{
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
if (bChroma)
predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
if (bChroma)
predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
}
}
else
{
MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
cu.clipMv(mv1);
/* uniprediction to L1 */
X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
if (pwp1 && pwp1->wtPresent)
{
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
if (bChroma)
predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
if (bChroma)
predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
}
}
}
}
5.skip模式计算率失真代价Search::encodeResAndCalcRdSkipCU
用于对跳过(Skip)模式进行残差编码并计算率失真的代价,对应代码分析如下:
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
/* Evaluate SKIP mode (no residual) for interMode: the reconstruction is the
 * prediction itself; only the skip flag and merge index are signalled.
 * Fills distortion, bit counts, energy terms and the final mode cost. */
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
{ // pull the CU data, reconstruction, source and prediction buffers out of interMode
CUData& cu = interMode.cu;
Yuv* reconYuv = &interMode.reconYuv;
const Yuv* fencYuv = interMode.fencYuv;
Yuv* predYuv = &interMode.predYuv;
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
uint32_t depth = cu.m_cuDepth[0];
// No residual coding : SKIP mode
// mark the CU as SKIP and clear all coded-block flags
cu.setPredModeSubParts(MODE_SKIP);
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
// with no residual, the reconstruction is simply a copy of the prediction
reconYuv->copyFromYuv(interMode.predYuv);
// Luma distortion: SSE between source and reconstruction
int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
interMode.distortion = interMode.lumaDistortion;
// Chroma distortion is added unless the format is monochrome (I400)
if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
{
interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
interMode.distortion += interMode.chromaDistortion;
}// record the total distortion on the CU
cu.m_distortion[0] = interMode.distortion;
m_entropyCoder.load(m_rqt[depth].cur);// load the entropy-coder context for this depth and reset the bit counter
m_entropyCoder.resetBits();
if (m_slice->m_pps->bTransquantBypassEnabled)// signal the transquant-bypass flag when the PPS enables it
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);// bits spent on the skip flag
int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
m_entropyCoder.codeMergeIndex(cu, 0);// bits spent on the merge index (written bits minus skip-flag bits)
interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
interMode.coeffBits = 0;
interMode.totalBits = interMode.mvBits + skipFlagBits;// SKIP signals no coefficients
if (m_rdCost.m_psyRd)// psycho-visual RD: add a psy energy term
interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
else if(m_rdCost.m_ssimRd)// SSIM-based RD: add an SSIM energy term
interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
// residual energy: SSE between source and (unclipped) prediction
interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
updateModeCost(interMode);// fold distortion and bits into the final mode cost
m_entropyCoder.store(interMode.contexts);// save the entropy contexts reached by this mode
}
6.merge模式计算率失真代价Search::encodeResAndCalcRdInterCU
用于对merge模式进行残差编码并计算率失真的代价(不仅支持merge模式的RD计算,也支持其他,除了skip模式),对应代码分析如下:
/* Full rate-distortion evaluation of an inter/merge coded CU with residual
 * coding: computes the residual, runs the residual quad-tree estimation,
 * considers dropping the residual entirely (root CBF zero), counts the
 * signalling bits, reconstructs, and fills distortion/bit/energy fields and
 * the final mode cost.
 * Fix: the cbf0 chroma SSE used predYuv->m_csize as the stride of the fenc
 * buffer; the fenc buffer must be read with its own pitch (fencYuv->m_csize).
 * The two strides coincide for same-sized CU buffers, but the call was
 * inconsistent with every other sse_pp invocation in this file. */
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{
ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
// pull CU data, reconstruction/prediction buffers, depth and CU size out of interMode/cuGeom
CUData& cu = interMode.cu;
Yuv* reconYuv = &interMode.reconYuv;
Yuv* predYuv = &interMode.predYuv;
uint32_t depth = cuGeom.depth;
ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
const Yuv* fencYuv = interMode.fencYuv;
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
uint32_t log2CUSize = cuGeom.log2CUSize;
int sizeIdx = log2CUSize - 2;
// residual = source - prediction
resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
uint32_t tuDepthRange[2];
cu.getInterTUQtDepthRange(tuDepthRange, 0);// allowed TU quad-tree depth range for inter CUs
// load the entropy-coder state for this depth
m_entropyCoder.load(m_rqt[depth].cur);
// choose the TU depth estimation strategy according to the TU limit flags
if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
m_maxTUDepth = -1;
else if (m_limitTU & X265_TU_LIMIT_BFS)
memset(&m_cacheTU, 0, sizeof(TUInfoCache));
Cost costs;
if (m_limitTU & X265_TU_LIMIT_NEIGH)
{ // neighbour-limited TU search: clamp m_maxTUDepth for this mode and restore it afterwards
/* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
int32_t tempDepth = m_maxTUDepth;
if (m_maxTUDepth != -1)
{
uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
uint32_t minSize = tuDepthRange[0];
uint32_t maxSize = tuDepthRange[1];
maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
}// run the residual quad-tree estimation
estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
m_maxTUDepth = tempDepth;
}
else
estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
uint32_t tqBypass = cu.m_tqBypass[0];
if (!tqBypass)
{ // luma SSE between source and prediction: the distortion if no residual were signalled
sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
{ // add the chroma SSE unless the format is monochrome (I400);
// the fenc buffer is read with its own stride, fencYuv->m_csize (was predYuv->m_csize)
cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
}
// cost of signalling a zero root CBF (no residual at all)
/* Consider the RD cost of not signaling any residual */
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
m_entropyCoder.codeQtRootCbfZero();
uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
uint32_t cbf0Energy; uint64_t cbf0Cost;
if (m_rdCost.m_psyRd)
{ // psycho-visual RD cost of the no-residual case
cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
}
else if(m_rdCost.m_ssimRd)
{ // SSIM-based RD cost of the no-residual case
cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
}
else// plain RD cost of the no-residual case
cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
// if skipping the residual is cheaper than coding it, clear the CBFs and TU depth
if (cbf0Cost < costs.rdcost)
{
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
}
}
// if any coefficients survived, save the residual quad-tree data
if (cu.getQtRootCbf(0))
saveResidualQTData(cu, *resiYuv, 0, 0);
// re-load the entropy coder to count this CU's signalling bits
/* calculate signal bits for inter/merge/skip coded CU */
m_entropyCoder.load(m_rqt[depth].cur);
// reset the bit counter
m_entropyCoder.resetBits();
if (m_slice->m_pps->bTransquantBypassEnabled)
m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
// merge + 2Nx2N + zero root CBF collapses to SKIP: only skip flag and merge index are coded
uint32_t coeffBits, bits, mvBits;
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
{
cu.setPredModeSubParts(MODE_SKIP);
/* Merge/Skip */
coeffBits = mvBits = 0;
m_entropyCoder.codeSkipFlag(cu, 0);
int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
m_entropyCoder.codeMergeIndex(cu, 0);
mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
bits = mvBits + skipFlagBits;
}
else
{ // general inter CU: code skip flag, prediction mode, partition size, prediction info and coefficients
m_entropyCoder.codeSkipFlag(cu, 0);
int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
m_entropyCoder.codePredMode(cu.m_predMode[0]);
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
m_entropyCoder.codePredInfo(cu, 0);
mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
bool bCodeDQP = m_slice->m_pps->bUseDQP;
m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
bits = m_entropyCoder.getNumberOfWrittenBits();
// split the total into coefficient, MV and skip-flag bits
coeffBits = bits - mvBits - skipFlagBits;
}
// store the entropy contexts reached after coding this mode
m_entropyCoder.store(interMode.contexts);
// reconstruction: prediction + clipped residual when residual is coded, else a plain copy
if (cu.getQtRootCbf(0))
reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
else
reconYuv->copyFromYuv(*predYuv);
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
interMode.distortion = bestLumaDist;
if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
{ // chroma distortion of the final reconstruction
sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
interMode.chromaDistortion = bestChromaDist;
interMode.distortion += bestChromaDist;
}
if (m_rdCost.m_psyRd)// psycho-visual energy of the final reconstruction
interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
else if(m_rdCost.m_ssimRd)// SSIM energy of the final reconstruction
interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
// residual energy plus the final bit/distortion bookkeeping for this mode
interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
interMode.totalBits = bits;
interMode.lumaDistortion = bestLumaDist;
interMode.coeffBits = coeffBits;
interMode.mvBits = mvBits;
cu.m_distortion[0] = interMode.distortion;
updateModeCost(interMode);// fold everything into the final RD cost
checkDQP(interMode, cuGeom);// check whether a delta-QP must be signalled
}
7.复杂度检查Analysis::complexityCheckCU
这个函数用于根据不同的复杂度指标对CU进行复杂度检查,以确定是否跳过进一步的分割。
如果recursionSkipMode等于RDCOST_BASED_RSKIP,则执行第一个分支。在这个分支中,函数计算了当前最佳模式bestMode的亮度分量的平均值和平均绝对偏差:它遍历bestMode.fencYuv->m_buf[0]中的像素值,先求出平均值,再求出各像素与平均值的平均绝对偏差。然后,将该偏差与平均值的10%进行比较:如果偏差小于平均值的10%,则返回true,表示当前CU的复杂度较低,可以跳过进一步的划分;否则返回false,表示当前CU的复杂度较高,需要进行进一步的划分。
如果recursionSkipMode不等于RDCOST_BASED_RSKIP,则执行第二个分支。在这个分支中,函数计算当前最佳模式bestMode对应位置的边缘图方差:它使用primitives.cu[blockType].var函数得到像素和与平方和,由此求出方差,再与参数edgeVarThreshold进行比较。如果边缘方差大于edgeVarThreshold,则返回false,表示当前CU的复杂度较高,需要进行进一步的划分;否则返回true,表示当前CU的复杂度较低,可以跳过进一步的划分。
/* Decide whether the best mode found so far is "simple" enough to skip
 * further CU splitting.
 * RDCOST_BASED_RSKIP: returns true when the mean absolute deviation of the
 * source luma block is below 10% of its mean (a homogeneous block).
 * Otherwise: returns true when the variance of the co-located edge-bit
 * picture block does not exceed m_param->edgeVarThreshold. */
bool Analysis::complexityCheckCU(const Mode& bestMode)
{
if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
{
const uint32_t dim = bestMode.fencYuv->m_size;
const uint32_t numPix = dim * dim;
// first pass: integer mean of the source luma samples
uint32_t pixSum = 0;
for (uint32_t row = 0; row < dim; row++)
{
for (uint32_t col = 0; col < dim; col++)
pixSum += (bestMode.fencYuv->m_buf[0][row * dim + col]);
}
const uint32_t avg = pixSum / numPix;
// second pass: mean absolute deviation from that mean
uint32_t devSum = 0;
for (uint32_t row = 0; row < dim; row++)
{
for (uint32_t col = 0; col < dim; col++)
devSum += abs(int(bestMode.fencYuv->m_buf[0][row * dim + col] - avg));
}
const uint32_t meanDev = devSum / numPix;
// homogeneous (low-complexity) when the deviation is under 10% of the mean
return meanDev < (.1 * avg);
}
else
{
// variance of the co-located block in the edge-bit picture
const int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
const int log2PixelCount = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
const intptr_t lumaStride = m_frame->m_fencPic->m_stride;
const intptr_t edgeOffset = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * lumaStride;
// var() packs the sample sum in the low 32 bits and the sum of squares in the high 32 bits
const uint64_t packed = primitives.cu[blockType].var(m_frame->m_edgeBitPic + edgeOffset, lumaStride);
const uint32_t sumPart = (uint32_t)packed;
const uint32_t ssPart = (uint32_t)(packed >> 32);
const uint32_t numPix = 1 << log2PixelCount;
const double edgeVar = (ssPart - ((double)sumPart * sumPart / numPix)) / numPix;
// low complexity (skip further splits) unless the edge variance exceeds the threshold
return !(edgeVar > (double)m_param->edgeVarThreshold);
}
}
8.帧间预测Analysis::checkInter_rd0_4
对应代码分析如下:
/* Fast (rd levels 0-4) evaluation of one inter partition mode: optionally
 * seeds each PU's best references/MVs from reused analysis data, runs the
 * inter prediction search, then computes the SA8D cost of the resulting
 * prediction and (optionally) saves the chosen references for reuse. */
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{ // reset the mode costs and mark the CU as inter with the requested partition size
interMode.initCosts();
interMode.cu.setPartSizeSubParts(partSize);
interMode.cu.setPredModeSubParts(MODE_INTER);
int numPredDir = m_slice->isInterP() ? 1 : 2;// number of prediction directions, from the slice type
if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
{
int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
int index = 0;
// seed each PU's best motion data with the reference indices saved by a previous analysis pass
uint32_t numPU = interMode.cu.getNumPartInter(0);
for (uint32_t part = 0; part < numPU; part++)
{
MotionData* bestME = interMode.bestME[part];
for (int32_t i = 0; i < numPredDir; i++)
bestME[i].ref = m_reuseRef[refOffset + index++];
}
}
if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
{
uint32_t numPU = interMode.cu.getNumPartInter(0);
for (uint32_t part = 0; part < numPU; part++)
{
MotionData* bestME = interMode.bestME[part];
for (int32_t i = 0; i < numPredDir; i++)
{ // multi-pass refinement: reuse reference index, MV and MVP index from first-pass statistics
int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
bestME[i].ref = ref[cuGeom.absPartIdx];
bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
}
}
}// run the inter prediction search (fills MVs/refs and interMode.sa8dBits), honouring the reference masks
predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
/* predInterSearch sets interMode.sa8dBits; now measure the prediction distortion */
const Yuv& fencYuv = *interMode.fencYuv;
Yuv& predYuv = interMode.predYuv;
int part = partitionFromLog2Size(cuGeom.log2CUSize);// luma SA8D between source and prediction
interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
{ // add chroma SA8D when enabled and the format is not monochrome
interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
}// combine distortion and estimated bits into the SA8D cost
interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
// when analysis saving is enabled, record each PU's best reference index for later reuse
if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
{
int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
int index = 0;
uint32_t numPU = interMode.cu.getNumPartInter(0);
for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
{
MotionData* bestME = interMode.bestME[puIdx];
for (int32_t i = 0; i < numPredDir; i++)
m_reuseRef[refOffset + index++] = bestME[i].ref;
}
}
}
9.帧间预测搜索Search::predInterSearch
帧间预测搜索,会比较merge模式代价、单向预测模式代价和双向预测模式代价,从而选取最优模式,对应代码分析如下:
/* Find the best inter prediction for each PU of the specified mode.
 * Per PU: estimates the merge cost, runs uni-directional motion estimation for
 * each reference list/index, optionally tries bi-directional prediction (with a
 * zero-MV variant), then stores the winning motion information into the CU.
 * interMode - in/out mode context; bestME, predYuv and sa8dBits are updated
 * cuGeom    - geometry of the CU being analysed
 * bChromaMC - whether chroma motion compensation is performed for the prediction
 * refMasks  - per-PU bitmask of references to consider (0 means "all";
 *             L1 reference bits start at bit 16) */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
CUData& cu = interMode.cu;// data of the current CU (Coding Unit)
Yuv* predYuv = &interMode.predYuv;// prediction YUV buffer of the current CU
// 12 mv candidates including lowresMV
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
const Slice *slice = m_slice;
int numPart = cu.getNumPartInter(0);// number of PUs inside this CU
int numPredDir = slice->isInterP() ? 1 : 2;// one prediction list for P slices, two for B slices
const int* numRefIdx = slice->m_numRefIdx;
uint32_t lastMode = 0;
int totalmebits = 0;
MV mvzero(0, 0);
Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
MergeData merge;
memset(&merge, 0, sizeof(merge));
bool useAsMVP = false;
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
MotionData* bestME = interMode.bestME[puIdx];
PredictionUnit pu(cu, cuGeom, puIdx);// set the source picture for motion estimation (next call)
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
useAsMVP = false;
x265_analysis_inter_data* interDataCTU = NULL;
int cuIdx;
cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
{ // if the CU's prediction mode, partition size etc. match the previously saved analysis data, reuse its MV as an MVP candidate
interDataCTU = m_frame->m_analysisData.interData;
if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
&& (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
&& !(interDataCTU->mergeFlag[cuIdx + puIdx])
&& (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
useAsMVP = true;
}
/* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
bestME[0].cost = MAX_UINT;// initialize best motion-estimation costs to maximum
bestME[1].cost = MAX_UINT;
// bits needed to signal this block/partition
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
bool bDoUnidir = true;// whether the regular uni-directional search must still be run
// fetch motion vectors of the neighbouring blocks
cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
/* Uni-directional prediction */// analysis-reuse path; not entered with the default configuration
if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
|| (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
{
for (int list = 0; list < numPredDir; list++)
{
// get the reference frame index (from reused analysis data or the previous pass)
int ref = -1;
if (useAsMVP)
ref = interDataCTU->refIdx[list][cuIdx + puIdx];
else
ref = bestME[list].ref;
if (ref < 0)
{
continue;
}// compute the signalling bit cost
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[list]);
// select the best motion vector predictor (MVP) and get its index
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
MV mvmin, mvmax, outmv, mvp;
if (useAsMVP)
{
mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
}
else
mvp = amvp[mvpIdx];
if (m_param->searchMethod == X265_SEA)
{
int puX = puIdx & 1;
int puY = puIdx >> 1;
for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
}// set the motion search range
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
MV mvpIn = mvp;
int satdCost;
if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
mvpIn = bestME[list].mv;
if (useAsMVP && m_param->mvRefine > 1)
{
MV bestmv, mvpSel[3];
int mvpIdxSel[3];
satdCost = m_me.COST_MAX;
mvpSel[0] = mvp;
mvpIdxSel[0] = mvpIdx;
mvpIdx = selectMVP(cu, pu, amvp, list, ref);
mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
mvpIdxSel[1] = mvpIdx;
if (m_param->mvRefine > 2)
{
mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
mvpIdxSel[2] = !mvpIdx;
}
for (int cand = 0; cand < m_param->mvRefine; cand++)
{
if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
continue;// skip duplicate MVP candidates; otherwise set the search range for this one
setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
if (satdCost > bcost)
{
satdCost = bcost;
outmv = bestmv;
mvp = mvpSel[cand];
mvpIdx = mvpIdxSel[cand];
}
}
mvpIn = mvp;
}
else
{
satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
}
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvpIdx, bits, cost */
if (!(m_param->analysisMultiPassRefine || useAsMVP))
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
else
{
/* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here
the actual mvp is bestME from pass 1 for that mvpIdx */
int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
if (diffBits < 0)
{
mvpIdx = !mvpIdx;
uint32_t origOutBits = bits;
bits = origOutBits + diffBits;
cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
}
mvp = amvp[mvpIdx];
}
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
bestME[list].mvp = mvp;
bestME[list].mvpIdx = mvpIdx;
bestME[list].cost = cost;
bestME[list].bits = bits;
bestME[list].mvCost = mvCost;
bestME[list].ref = ref;
}
bDoUnidir = false;
}
}
else if (m_param->bDistributeMotionEstimation)
{
PME pme(*this, interMode, cuGeom, pu, puIdx);
pme.m_jobTotal = 0;
pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
for (int list = 0; list < numPredDir; list++)
{
int idx = 0;
for (int ref = 0; ref < numRefIdx[list]; ref++)
{
if (!(refMask & (1 << ref)))
continue;
pme.m_jobs.ref[list][idx++] = ref;
pme.m_jobTotal++;
}
pme.m_jobs.refCnt[list] = idx;
/* the second list ref bits start at bit 16 */
refMask >>= 16;
}
if (pme.m_jobTotal > 2)
{
pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
processPME(pme, *this);
int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
bDoUnidir = false;
ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
pme.waitForExit();
}
/* if no peer threads were bonded, fall back to doing unidirectional
* searches ourselves without overhead of singleMotionEstimation() */
}
if (bDoUnidir)// regular uni-directional motion estimation
{
interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
for (int list = 0; list < numPredDir; list++)
{
for (int ref = 0; ref < numRefIdx[list]; ref++)
{
ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
// skip this reference picture if it is filtered out by refMask; continue with the next one
if (!(refMask & (1 << ref)))
{
ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
continue;
}
// compute signalling bits and build the MV candidate list for this list/ref
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[list]);
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
// select the best motion vector predictor (MVP) as the initial search start point
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
bool bLowresMVP = false;
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
{
MV lmv = getLowresMV(cu, pu, list, ref);
if (lmv.notZero())
mvc[numMvc++] = lmv;
if (m_param->bEnableHME)
mvp_lowres = lmv;
}
if (m_param->searchMethod == X265_SEA)
{
int puX = puIdx & 1;
int puY = puIdx >> 1;
for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
}// run motion estimation against this reference within the search range and measure its cost
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
// when HME is enabled and the lowres MVP differs, also search from it and keep the cheaper result
if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
{
MV outmv_lowres;
setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
if (lowresMvCost < satdCost)
{
outmv = outmv_lowres;
satdCost = lowresMvCost;
bLowresMVP = true;
}
}
// combine distortion with the rate cost of the signalled bits
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
/* Update LowresMVP to best AMVP cand */
if (bLowresMVP)
updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
/* Refine MVP selection, updates: mvpIdx, bits, cost */
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
// keep this reference if it beats the current best of this prediction direction
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
bestME[list].mvp = mvp;
bestME[list].mvpIdx = mvpIdx;
bestME[list].ref = ref;
bestME[list].cost = cost;
bestME[list].bits = bits;
bestME[list].mvCost = mvCost;
}
}
/* the second list ref bits start at bit 16 */
refMask >>= 16;
}
}
/* Bi-directional prediction */
MotionData bidir[2];
uint32_t bidirCost = MAX_UINT;
int bidirBits = 0;
// bidir is tried only for inter-B slices without bipred restriction, for non-2Nx2N PUs
// (2Nx2N bidir is handled elsewhere), and only when both lists produced a valid uni result
if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{ // seed bidir[0]/bidir[1] with the two best uni-directional results
bidir[0] = bestME[0];
bidir[1] = bestME[1];
int satdCost;
if (m_me.bChromaSATD)
{
cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
motionCompensation(cu, pu, tmpPredYuv, true, true);
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
}
else
{
PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
/* Generate reference subpels */
predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
// bidir bits = both uni bit counts adjusted for list-selection signalling; total cost = satd + rate
bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
// also try the zero-MV pair when at least one of the best MVs is non-zero
bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{ // only worth trying if both MVPs fall within the valid search area
/* Do not try zero MV if unidir motion predictors are beyond
* valid search area */
MV mvmin, mvmax;
int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
setSearchRange(cu, mvzero, merange, mvmin, mvmax);
mvmax.y += 2; // there is some pad for subpel refine
mvmin <<= 2;
mvmax <<= 2;
bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
/* coincident blocks of the two reference pictures */
if (m_me.bChromaSATD)
{
cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
motionCompensation(cu, pu, tmpPredYuv, true, true);
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
}
else
{
const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
intptr_t refStride = slice->m_mref[0][0].lumaStride;
primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
MV mvp0 = bestME[0].mvp;
int mvpIdx0 = bestME[0].mvpIdx;
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
MV mvp1 = bestME[1].mvp;
int mvpIdx1 = bestME[1].mvpIdx;
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
// keep the zero-MV bidir result if it is cheaper than the seeded bidir result
if (cost < bidirCost)
{
bidir[0].mv = mvzero;
bidir[1].mv = mvzero;
bidir[0].mvp = mvp0;
bidir[1].mvp = mvp1;
bidir[0].mvpIdx = mvpIdx0;
bidir[1].mvpIdx = mvpIdx1;
bidirCost = cost;
bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
}
}
}
// choose the best option among merge / bidir / uni and store the decision into the CU
/* select best option and store into CU */
if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{// merge mode is cheapest: store the merge candidate's motion information
cu.m_mergeFlag[pu.puAbsPartIdx] = true;
cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
totalmebits += merge.bits;
}// bi-directional prediction is cheapest
else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
lastMode = 2;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}// otherwise the cheaper uni-directional direction wins (L0 preferred on ties)
else if (bestME[0].cost <= bestME[1].cost)
{
lastMode = 0;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[0].bits;
}
else
{
lastMode = 1;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[1].bits;
}
motionCompensation(cu, pu, *predYuv, true, bChromaMC);
}// accumulate the total motion-estimation bit count into interMode.sa8dBits
interMode.sa8dBits += totalmebits;
}
10.运动估计MotionEstimate::motionEstimate
实现AMVP过程,以候选最优mv为起点,先进行二分之一亚像素运动估计,再进行四分之一亚像素运动估计,对应代码分析如下:
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
const MV & mvmin,
const MV & mvmax,
const MV & qmvp,
int numCandidates,
const MV * mvc,
int merange,
MV & outQMv,
uint32_t maxSlices,
pixel * srcReferencePlane)
{ //根据一些参数和条件进行一些初始化操作,包括确定是否使用低分辨率参考图像、计算图像偏移、获取参考图像和当前图像的指针等
ALIGN_VAR_16(int, costs[16]);
bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
if (ctuAddr >= 0)
blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
pixel* fenc = fencPUYuv.m_buf[0];
pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
//设置初始的运动矢量预测(MVP)为给定的qmvp
setMVP(qmvp);
//放大4倍,转换为1/4像素的向量
MV qmvmin = mvmin.toQPel();
MV qmvmax = mvmax.toQPel();
/* The term cost used here means satd/sad values for that particular search.
* The costs used in ME integer search only includes the SAD cost of motion
* residual and sqrtLambda times MVD bits. The subpel refine steps use SATD
* cost of residual and sqrtLambda * MVD bits. Mode decision will be based
* on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
* (mode + MVD bits). */
// measure SAD cost at clipped QPEL MVP 根据给定的mvmin和mvmax对qmvp进行裁剪,得到pmv
MV pmv = qmvp.clipped(qmvmin, qmvmax);
MV bestpre = pmv;
int bprecost;
//并计算pmv该运动矢量的初始代价bprecost
if (ref->isLowres)//如果参考图像为低分辨率图像,则使用低分辨率QPel代价函数进行计算,否则使用亚像素比较函数进行计算
bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
else
bprecost = subpelCompare(ref, pmv, sad);
/* re-measure full pel rounded MVP with SAD as search start point */
MV bmv = pmv.roundToFPel();
int bcost = bprecost;
if (pmv.isSubpel())
bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
//如果pmv不为零,则测量运动矢量为零时的代价,并与初始代价进行比较,选择代价较小的作为最佳运动矢量
// measure SAD cost at MV(0) if MVP is not zero
if (pmv.notZero())
{
int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
if (cost < bcost)
{
bcost = cost;
bmv = 0;
bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
}
}
X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
// measure SAD cost at each QPEL motion vector candidate
for (int i = 0; i < numCandidates; i++)//对于每个给定的运动矢量候选项mvc
{ //将候选项裁剪到mvmin和mvmax的范围内
MV m = mvc[i].clipped(qmvmin, qmvmax);
if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
{//如果候选项不为零且不等于pmv和bestpre(避免重复测量),则使用亚像素比较函数计算该候选项的代价,并与当前最佳代价进行比较,更新最佳运动矢量和代价
int cost = subpelCompare(ref, m, sad) + mvcost(m);
if (cost < bprecost)
{
bprecost = cost;
bestpre = m;
}
}
}
//将pmv取整到QPel,并将bmv初始化为pmv
pmv = pmv.roundToFPel();
MV omv = bmv; // current search origin or starting point
int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
switch (search)
{
case X265_DIA_SEARCH://这里使用的是钻石搜索方法(DIAMOND_SEARCH)
{
/* diamond search, radius 1 */
bcost <<= 4;//设置初始的搜索代价为bmv的代价的16倍
int i = merange;
do
{ //进行一系列搜索步骤,每步选择当前位置周围的四个方向进行搜索,并更新最佳代价和最佳运动矢
COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
if (!(bcost & 15))
break;
bmv.x -= (bcost << 28) >> 30;
bmv.y -= (bcost << 30) >> 30;
bcost &= ~15;
}
while (--i && bmv.checkRange(mvmin, mvmax));//搜索步骤的次数由merange确定,且在满足条件的情况下进行搜索
bcost >>= 4;//最终将搜索代价右移4位,得到最终的搜索代价
break;
}
case X265_HEX_SEARCH:
{
me_hex2:
/* hexagon search, radius 2 */
#if 0
for (int i = 0; i < merange / 2; i++)
{
omv = bmv;
COST_MV(omv.x - 2, omv.y);
COST_MV(omv.x - 1, omv.y + 2);
COST_MV(omv.x + 1, omv.y + 2);
COST_MV(omv.x + 2, omv.y);
COST_MV(omv.x + 1, omv.y - 2);
COST_MV(omv.x - 1, omv.y - 2);
if (omv == bmv)
break;
if (!bmv.checkRange(mvmin, mvmax))
break;
}
#else // if 0 用于执行六边形和方形搜索过程
/* equivalent to the above, but eliminates duplicate candidates */
COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);
bcost <<= 3;//根据当前的运动矢量bmv和一些预定义的偏移量进行搜索。搜索过程按照特定的顺序,依次计算候选运动矢量的代价,并与当前最佳代价进行比较。如果候选运动矢量在允许的范围内且代价较小,则更新最佳运动矢量和代价。搜索过程中,根据当前的方向选择相应的偏移量,并在每次搜索后更新方向和运动矢量
if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
{
COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
}
COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);
if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
{
COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
}
if (bcost & 7)
{
int dir = (bcost & 7) - 2;
if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
{
bmv += hex2[dir + 1];
//如果检测到最优点不是当前点,则按照半六边形方式往最优点方向遍历
/* half hexagon, not overlapping the previous iteration */
for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
{
COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
hex2[dir + 1].x, hex2[dir + 1].y,
hex2[dir + 2].x, hex2[dir + 2].y,
costs);
bcost &= ~7;
if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
if (!(bcost & 7))
break;
dir += (bcost & 7) - 2;
dir = mod6m1[dir + 1];
bmv += hex2[dir + 1];
}
} // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
}
bcost >>= 3;
#endif // if 0
//方形搜索将当前最佳运动矢量作为中心,按照固定的偏移量进行搜索。类似于六边形搜索,方形搜索也计算候选运动矢量的代价,并与当前最佳代价进行比较。如果候选运动矢量在允许的范围内且代价较小,则更新最佳运动矢量和代价
/* square refine */
int dir = 0;
COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[0], dir, 1);
if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[1], dir, 2);
COPY2_IF_LT(bcost, costs[2], dir, 3);
COPY2_IF_LT(bcost, costs[3], dir, 4);
COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[0], dir, 5);
if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[1], dir, 6);
if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[2], dir, 7);
if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
COPY2_IF_LT(bcost, costs[3], dir, 8);
bmv += square1[dir];
break;
}
case X265_UMH_SEARCH:
{
int ucost1, ucost2;
int16_t cross_start = 1;
/* refine predictors */
omv = bmv;
ucost1 = bcost;
X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
DIA1_ITER(pmv.x, pmv.y);
if (pmv.notZero())
DIA1_ITER(0, 0);
ucost2 = bcost;
if (bmv.notZero() && bmv != pmv)
DIA1_ITER(bmv.x, bmv.y);
if (bcost == ucost2)
cross_start = 3;
/* Early Termination */
omv = bmv;
if (bcost == ucost2 && SAD_THRESH(2000))
{
COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
if (bcost == ucost1 && SAD_THRESH(500))
break;
if (bcost == ucost2)
{
int16_t range = (int16_t)(merange >> 1) | 1;
CROSS(3, range, range);
COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
if (bcost == ucost2)
break;
cross_start = range + 2;
}
}
// TODO: Need to study x264's logic for building mvc list to understand why they
// have special cases here for 16x16, and whether they apply to HEVC CTU
// adaptive search range based on mvc variability
if (numCandidates)
{
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
* these have not been tuned much by actual encoding. */
static const uint8_t range_mul[4][4] =
{
{ 3, 3, 4, 4 },
{ 3, 4, 4, 4 },
{ 4, 4, 4, 5 },
{ 4, 4, 5, 6 },
};
int mvd;
int sad_ctx, mvd_ctx;
int denom = 1;
if (numCandidates == 1)
{
if (LUMA_64x64 == partEnum)
/* mvc is probably the same as mvp, so the difference isn't meaningful.
* but prediction usually isn't too bad, so just use medium range */
mvd = 25;
else
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
}
else
{
/* calculate the degree of agreement between predictors. */
/* in 64x64, mvc includes all the neighbors used to make mvp,
* so don't count mvp separately. */
denom = numCandidates - 1;
mvd = 0;
if (partEnum != LUMA_64x64)
{
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
denom++;
}
mvd += predictorDifference(mvc, numCandidates);
}
sad_ctx = SAD_THRESH(1000) ? 0
: SAD_THRESH(2000) ? 1
: SAD_THRESH(4000) ? 2 : 3;
mvd_ctx = mvd < 10 * denom ? 0
: mvd < 20 * denom ? 1
: mvd < 40 * denom ? 2 : 3;
merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
}
/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
* we are still centered on the same place as the DIA2. is this desirable? */
CROSS(cross_start, merange, merange >> 1);
COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
/* hexagon grid */
omv = bmv;
const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
uint16_t i = 1;
do
{
if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
mvmax.y - omv.y, omv.y - mvmin.y))
{
for (int j = 0; j < 16; j++)
{
MV mv = omv + (hex4[j] * i);
if (mv.checkRange(mvmin, mvmax))
COST_MV(mv.x, mv.y);
}
}
else
{
int16_t dir = 0;
pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
size_t dy = (size_t)i * stride;
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
sad_x4(fenc, \
fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
stride, costs + 4 * k); \
fref_base += 2 * dy;
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
#define MIN_MV(k, dx, dy) if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }
SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
ADD_MVCOST(0, 0, -4);
ADD_MVCOST(1, 0, 4);
ADD_MVCOST(2, -2, -3);
ADD_MVCOST(3, 2, -3);
ADD_MVCOST(4, -4, -2);
ADD_MVCOST(5, 4, -2);
ADD_MVCOST(6, -4, -1);
ADD_MVCOST(7, 4, -1);
ADD_MVCOST(8, -4, 0);
ADD_MVCOST(9, 4, 0);
ADD_MVCOST(10, -4, 1);
ADD_MVCOST(11, 4, 1);
ADD_MVCOST(12, -4, 2);
ADD_MVCOST(13, 4, 2);
ADD_MVCOST(14, -2, 3);
ADD_MVCOST(15, 2, 3);
MIN_MV(0, 0, -4);
MIN_MV(1, 0, 4);
MIN_MV(2, -2, -3);
MIN_MV(3, 2, -3);
MIN_MV(4, -4, -2);
MIN_MV(5, 4, -2);
MIN_MV(6, -4, -1);
MIN_MV(7, 4, -1);
MIN_MV(8, -4, 0);
MIN_MV(9, 4, 0);
MIN_MV(10, -4, 1);
MIN_MV(11, 4, 1);
MIN_MV(12, -4, 2);
MIN_MV(13, 4, 2);
MIN_MV(14, -2, 3);
MIN_MV(15, 2, 3);
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
if (dir)
{
bmv.x = omv.x + i * (dir >> 4);
bmv.y = omv.y + i * ((dir << 28) >> 28);
}
}
}
while (++i <= merange >> 2);
if (bmv.checkRange(mvmin, mvmax))
goto me_hex2;
break;
}
case X265_STAR_SEARCH: // Adapted from HM ME
{
int bPointNr = 0;
int bDistance = 0;
const int EarlyExitIters = 3;
StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
if (bDistance == 1)
{
// if best distance was only 1, check two missing points. If no new point is found, stop
if (bPointNr)
{
/* For a given direction 1 to 8, check nearest two outer X pixels
X X
X 1 2 3 X
4 * 5
X 6 7 8 X
X X
*/
int saved = bcost;
const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
if (mv1.checkRange(mvmin, mvmax))
{
COST_MV(mv1.x, mv1.y);
}
if (mv2.checkRange(mvmin, mvmax))
{
COST_MV(mv2.x, mv2.y);
}
if (bcost == saved)
break;
}
else
break;
}
const int RasterDistance = 5;
if (bDistance > RasterDistance)
{
// raster search refinement if original search distance was too big
MV tmv;
for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
{
for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
{
if (tmv.x + (RasterDistance * 3) <= mvmax.x)
{
pixel *pix_base = fref + tmv.y * stride + tmv.x;
sad_x4(fenc,
pix_base,
pix_base + RasterDistance,
pix_base + RasterDistance * 2,
pix_base + RasterDistance * 3,
stride, costs);
costs[0] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[0], bmv, tmv);
tmv.x += RasterDistance;
/* NOTE(review): this excerpt begins mid-function -- it is the tail of
 * MotionEstimate::motionEstimate(). The signature, local declarations and the
 * earlier search-method cases (DIA/HEX/UMH setup, the sad_x4 call that filled
 * costs[0..3], the costs[0] lane) are above this excerpt. Indentation below is
 * reconstructed; the article stripped the original leading whitespace. */
                    /* Raster tail of the star search: sad_x4() just produced SAD
                     * costs for four horizontally adjacent fpel candidates spaced
                     * RasterDistance apart; add the MV signalling cost (mvcost
                     * takes qpel units, hence << 2) and keep the cheapest. */
                    costs[1] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                    tmv.x += RasterDistance;
                    costs[2] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                    tmv.x += RasterDistance;
                    /* NOTE(review): the other three lanes shift by 2 (fpel -> qpel);
                     * "tmv << 3" here looks like a transcription typo in the article --
                     * upstream x265 me.cpp uses (tmv << 2) for all four lanes, as the
                     * FULL_SEARCH case below also does. Confirm against the original. */
                    costs[3] += mvcost(tmv << 3);
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
                }
                else
                    /* Fewer than four candidates remain on this raster row:
                     * fall back to the single-candidate cost macro. */
                    COST_MV(tmv.x, tmv.y);
            }
        }
    }
    /* Star refinement: repeatedly re-center an expanding star pattern on the
     * current best MV; stop when the winner is the center (bDistance == 0). */
    while (bDistance > 0)
    {
        // center a new search around current best
        bDistance = 0;
        bPointNr = 0;
        const int MaxIters = 32;
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
        if (bDistance == 1)
        {
            if (!bPointNr)
                break;
            /* For a given direction 1 to 8, check nearest 2 outer X pixels
                 X   X
               X 1 2 3 X
                 4 * 5
               X 6 7 8 X
                 X   X
            */
            const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
            const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
            if (mv1.checkRange(mvmin, mvmax))
            {
                COST_MV(mv1.x, mv1.y);
            }
            if (mv2.checkRange(mvmin, mvmax))
            {
                COST_MV(mv2.x, mv2.y);
            }
            break;
        }
    }
    break;
}
case X265_SEA:
{
    // Successive Elimination Algorithm
    /* Clamp the search window to merange around the origin MV (omv),
     * further bounded by the legal MV range. */
    const int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);
    const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);
    const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);
    const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);
    /* Bias the MV-cost tables so they can be indexed by absolute MV component. */
    const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
    const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
    /* Scratch buffer receives the x offsets of candidates that survive ADS. */
    int16_t* meScratchBuffer = NULL;
    int scratchSize = merange * 2 + 4;
    if (scratchSize)
    {
        meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
        memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
    }
    /* SEA is fastest in multiples of 4 */
    int meRangeWidth = (maxX - minX + 3) & ~3;
    int w = 0, h = 0; // Width and height of the PU
    ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
    ALIGN_VAR_32(int, encDC[4]);
    uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);
    sizesFromPartition(partEnum, &w, &h);
    /* Sub-block step: full size for small PUs, half otherwise. */
    int deltaX = (w <= 8) ? (w) : (w >> 1);
    int deltaY = (h <= 8) ? (h) : (h >> 1);
    /* Check if very small rectangular blocks which cannot be sub-divided anymore */
    bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||
        partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;
    /* Check if vertical partition */
    bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
        partEnum == LUMA_4x8;
    /* Check if horizontal partition */
    bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||
        partEnum == LUMA_8x4;
    /* Check if assymetric vertical partition */
    bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||
        partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;
    /* Check if assymetric horizontal partition */
    bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||
        partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;
    int tempPartEnum = 0;
    /* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */
    if (verticalRect)
        tempPartEnum = partitionFromSizes(w, h >> 1);
    /* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */
    else if (horizontalRect)
        tempPartEnum = partitionFromSizes(w >> 1, h);
    /* We have integral planes introduced to account for assymetric partitions.
     * Hence all assymetric partitions except those which cannot be split into legal sizes,
     * are split into four for ads_x4() */
    else if (assymetricVertical || assymetricHorizontal)
        tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);
    /* General case: Square partitions. All partitions with width > 8 are split into four
     * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
    else
        tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);
    /* Successive elimination by comparing DC before a full SAD,
     * because sum(abs(diff)) >= abs(diff(sum)). */
    primitives.pu[tempPartEnum].sad_x4(zero,
        fenc,
        fenc + deltaX,
        fenc + deltaY * FENC_STRIDE,
        fenc + deltaX + deltaY * FENC_STRIDE,
        FENC_STRIDE,
        encDC);
    /* Assigning appropriate integral plane */
    /* The integral[] index is selected by sub-block geometry (deltaX, deltaY);
     * each plane was precomputed for one legal sub-block size. */
    uint32_t *sumsBase = NULL;
    switch (deltaX)
    {
    case 32: if (deltaY % 24 == 0)
                 sumsBase = integral[1];
             else if (deltaY == 8)
                 sumsBase = integral[2];
             else
                 sumsBase = integral[0];
             break;
    case 24: sumsBase = integral[3];
             break;
    case 16: if (deltaY % 12 == 0)
                 sumsBase = integral[5];
             else if (deltaY == 4)
                 sumsBase = integral[6];
             else
                 sumsBase = integral[4];
             break;
    case 12: sumsBase = integral[7];
             break;
    case 8: if (deltaY == 32)
                sumsBase = integral[8];
            else
                sumsBase = integral[9];
            break;
    case 4: if (deltaY == 16)
                sumsBase = integral[10];
            else
                sumsBase = integral[11];
            break;
    default: sumsBase = integral[11];
             break;
    }
    /* For square and vertically-split partitions the vertical sub-block offset
     * is expressed in rows, so scale it by the reference stride. */
    if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||
        partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
        partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||
        partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||
        partEnum == LUMA_16x64)
        deltaY *= (int)stride;
    if (verticalRect)
        encDC[1] = encDC[2];
    if (horizontalRect)
        deltaY = deltaX;
    /* ADS and SAD */
    MV tmv;
    for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
    {
        int i, xn;
        /* Early-out on the row cost alone; temporarily subtract it so the
         * ADS threshold is in pure-SAD terms for this row. */
        int ycost = p_cost_mvy[tmv.y] << 2;
        if (bcost <= ycost)
            continue;
        bcost -= ycost;
        /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions
         * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
         * ADS_2 for all other rectangular partitions */
        xn = ads(encDC,
            sumsBase + minX + tmv.y * stride,
            deltaY,
            fpelCostMvX + minX,
            meScratchBuffer,
            meRangeWidth,
            bcost);
        /* Full SAD only on the xn survivors, three at a time. */
        for (i = 0; i < xn - 2; i += 3)
            COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
                minX + meScratchBuffer[i + 1], tmv.y,
                minX + meScratchBuffer[i + 2], tmv.y);
        bcost += ycost;
        for (; i < xn; i++)
            COST_MV(minX + meScratchBuffer[i], tmv.y);
    }
    if (meScratchBuffer)
        x265_free(meScratchBuffer);
    break;
}
case X265_FULL_SEARCH:
{
    // dead slow exhaustive search, but at least it uses sad_x4()
    MV tmv;
    int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
    if (ref->isHMELowres)
    {
        /* Lowres HME pass: additionally clamp the window to +/- merange. */
        merange = (merange < 0 ? -merange : merange);
        mvmin_y = X265_MAX(mvmin.y, -merange);
        mvmin_x = X265_MAX(mvmin.x, -merange);
        mvmax_y = X265_MIN(mvmax.y, merange);
        mvmax_x = X265_MIN(mvmax.x, merange);
    }
    for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
    {
        for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
        {
            /* Batch four adjacent columns through sad_x4 when they all fit. */
            if (tmv.x + 3 <= mvmax_x)
            {
                pixel *pix_base = fref + tmv.y * stride + tmv.x;
                sad_x4(fenc,
                    pix_base,
                    pix_base + 1,
                    pix_base + 2,
                    pix_base + 3,
                    stride, costs);
                costs[0] += mvcost(tmv << 2);
                COPY2_IF_LT(bcost, costs[0], bmv, tmv);
                tmv.x++;
                costs[1] += mvcost(tmv << 2);
                COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                tmv.x++;
                costs[2] += mvcost(tmv << 2);
                COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                tmv.x++;
                costs[3] += mvcost(tmv << 2);
                COPY2_IF_LT(bcost, costs[3], bmv, tmv);
            }
            else
                COST_MV(tmv.x, tmv.y);
        }
    }
    break;
}
default:
    X265_CHECK(0, "invalid motion estimate mode\n");
    break;
}
/* If the cost of one of the pre-search MV predictors (bprecost) beats the best
 * searched cost (bcost), adopt that predictor MV (bestpre) and its cost. */
if (bprecost < bcost)
{
    bmv = bestpre;
    bcost = bprecost;
}
else /* Otherwise promote the searched fpel MV to quarter-pel units so it can
      * be refined at sub-pel precision below. */
    bmv = bmv.toQPel(); // promote search bmv to qpel
const SubpelWorkload& wl = workload[this->subpelRefine];
/* With multiple slices, clamp bmv.y back into the slice's legal qpel range and
 * re-measure its cost, so motion estimation never crosses a slice boundary. */
// check mv range for slice bound
if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
{
    bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
    bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
}
/* Zero distortion at the clipped MVP: skip sub-pel refinement entirely, but the
 * returned cost must still include the MV signalling cost. */
if (!bcost)
{
    /* if there was zero residual at the clipped MVP, we can skip subpel
     * refine, but we do need to include the mvcost in the returned cost */
    bcost = mvcost(bmv);
}
else if (ref->isLowres)
{ /* Lowres reference: cheaper sub-pel refinement via lowresQPelCost(). */
    int bdir = 0;
    for (int i = 1; i <= wl.hpel_dirs; i++)
    { /* Half-pel pass: probe bmv offset by square1[i] * 2 (two qpel units). */
        MV qmv = bmv + square1[i] * 2;
        /* skip invalid range */
        if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
            continue;
        /* SAD-based cost for the candidate; remember the best direction. */
        int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
        COPY2_IF_LT(bcost, cost, bdir, i);
    }
    /* Commit the winning half-pel offset, then re-measure with SATD. */
    bmv += square1[bdir] * 2;
    bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
    bdir = 0;
    for (int i = 1; i <= wl.qpel_dirs; i++)
    { /* Quarter-pel pass: single-qpel offsets around the half-pel winner. */
        MV qmv = bmv + square1[i];
        /* skip invalid range */
        if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
            continue;
        int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
        COPY2_IF_LT(bcost, cost, bdir, i);
    }
    bmv += square1[bdir];
}
else
{ /* Full-resolution iterative sub-pel refinement of the search result. */
    pixelcmp_t hpelcomp;
    /* The workload table decides whether the half-pel stage uses SATD or SAD. */
    if (wl.hpel_satd)
    {
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
        hpelcomp = satd;
    }
    else
        hpelcomp = sad;
    for (int iter = 0; iter < wl.hpel_iters; iter++)
    {
        int bdir = 0;
        for (int i = 1; i <= wl.hpel_dirs; i++)
        { /* Probe half-pel neighbors: bmv offset by square1[i] * 2. */
            MV qmv = bmv + square1[i] * 2;
            /* Skip candidates outside the slice-bounded qpel range. */
            // check mv range for slice bound
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                continue;
            /* Track the cheapest direction for this iteration. */
            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }
        if (bdir)
            bmv += square1[bdir] * 2;
        else
            break; /* converged: no neighbor improved */
    }
    /* Sub-pel refinement runs in two stages: HPEL (half-pel) then QPEL (quarter-pel). */
    /* if HPEL search used SAD, remeasure with SATD before QPEL */
    if (!wl.hpel_satd)
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
    for (int iter = 0; iter < wl.qpel_iters; iter++)
    { /* QPEL stage: iterate until no direction improves the cost. */
        int bdir = 0;
        for (int i = 1; i <= wl.qpel_dirs; i++)
        {
            MV qmv = bmv + square1[i];
            // check mv range for slice bound
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                continue;
            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }
        if (bdir)
            bmv += square1[bdir];
        else
            break;
    }
}
// check mv range for slice bound
X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
x265_emms(); /* clear MMX state before returning to float-using code */
outQMv = bmv;
return bcost;
}
点赞、收藏,会是我继续写作的动力!赠人玫瑰,手有余香。