粗粒度率失真计算:
rdcost = satd(fenc, pred) + lambda * IPM_bits。其中 satd 在一定程度上反映了预测残差在频域的能量,弥补了 IPM_bits(仅为编码帧内预测方向所需的比特数)没有计入残差系数等 bits 开销的不足(下面的代码中实际调用的是 sa8d 函数)。该方法不需要进行变换/量化/反量化/反变换等过程,计算开销和时间开销都很小;但其结果只具有一定的代表性,按它选出的"最优"不一定是严格意义上的最优,只能说是较优的候选。
细粒度率失真计算:
rdcost = sse(fenc, recon) + lambda * all_bits。该方法严格计算了原始帧和重建帧之间的 distortion,并执行了完整的编码流程(包括变换/量化/反量化/反变换等),是真正意义上的拉格朗日率失真优化计算,其得到的最优就是严格意义上的最优,但计算成本大。
x265 在分析最优帧内预测方向时结合了这两种方式:先用粗粒度计算方式筛选出若干可能最优的帧内预测方向作为备选集,再在备选集中用细粒度计算方式确定严格意义上的最优帧内预测方向。
/*
 * Search::estIntraPredQT
 *
 * Chooses the best luma intra prediction direction for every PU of the
 * current CU, codes each PU with that direction, and returns the summed
 * distortion of the whole CU.
 *
 * Parameters:
 *   intraMode  - mode under evaluation; supplies the CU data and the
 *                fenc / prediction / reconstruction YUV buffers.
 *   cuGeom     - geometry of the CU (depth, log2CUSize, numPartitions and
 *                absPartIdx are read here).
 *   depthRange - forwarded unchanged to codeIntraLumaQT.
 * Returns:
 *   totalDistortion, i.e. the sum of icosts.distortion from the final
 *   (re-measured) coding of each PU.
 *
 * Outline:
 *   1. Derive depth, initTuDepth, TU size, number of PUs, etc.
 *   2. Decide whether the transform-skip coding path is used.
 *   3. For every PU of the CU:
 *      3.1 Determine the best intra direction (bmode):
 *          - if a direction is already assigned (not ALL_IDX), use it;
 *          - otherwise run the two-stage search:
 *            a. gather availability of the neighbouring reference samples;
 *            b. prepare the reference sample buffers (unfiltered and
 *               smoothed) via initAdiPattern;
 *            c. load the three MPMs and the bit cost of a non-MPM mode;
 *            d. DC: predict, get the mode bits, distortion = sa8d(fenc, pred),
 *               store cost[DC] and use it as the initial best cost bcost;
 *            e. PLANAR: predict (smoothed references when the TU size is
 *               8..32, unfiltered otherwise), get the mode bits,
 *               distortion = sa8d(fenc, pred), store cost[PLANAR] and
 *               update bcost;
 *            f. angular modes 2..34:
 *               - if an intra_pred_allangs primitive exists: transpose fenc,
 *                 produce all 33 predictions in one call, then for each mode
 *                 compute sa8d against the transposed fenc for modes 2..17
 *                 and against the original fenc for modes 18..34;
 *               - otherwise predict each mode separately (choosing filtered
 *                 or unfiltered references per mode) and compute
 *                 sa8d(fenc, pred);
 *               in both cases store cost[mode] and update bcost;
 *            g. keep at most maxCandCount candidates: modes whose cost is
 *               below 1.25 * bcost, plus MPM[0] unconditionally;
 *            h. for every candidate: reload the entropy coder state, set
 *               the direction, code the PU (no TU splits) to obtain a full
 *               RD cost, and keep the mode with the lowest rdcost as bmode.
 *      3.2 Set bmode on the PU and reload the entropy coder state.
 *      3.3 Code the PU again with bmode via codeIntraLumaTSkip or
 *          codeIntraLumaQT (this time allowing TU splits) to obtain the
 *          final costs.
 *      3.4 Add the PU's distortion to totalDistortion.
 *      3.5 Extract the PU's result into reconYuv (extractIntraResultQT).
 *      3.6 If this is not the last PU, copy its reconstructed luma into the
 *          frame's recon picture so the next PU can predict from it.
 *   4. If the CU holds several PUs, OR their luma cbf flags together.
 *   5. Return totalDistortion.
 */
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{
CUData& cu = intraMode.cu;
// Source (fenc), prediction and reconstruction YUV buffers of this mode
const Yuv* fencYuv = intraMode.fencYuv;
Yuv* predYuv = &intraMode.predYuv;
Yuv* reconYuv = &intraMode.reconYuv;
uint32_t depth = cuGeom.depth; // CU depth
uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; // initial TU depth: 0 for 2Nx2N, 1 for any other partition size
uint32_t numPU = 1 << (2 * initTuDepth); // number of PUs: 1 when initTuDepth is 0, 4 when it is 1
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;// log2 of the TU size
uint32_t tuSize = 1 << log2TrSize; // TU size in pixels
// A quarter of the CU's partitions; absPartIdx advances by this amount per PU
uint32_t qNumParts = cuGeom.numPartitions >> 2;
// Index into primitives.cu[] for this TU size
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
sse_t totalDistortion = 0;
// Use the transform-skip coding path only when the PPS enables it, the CU is
// not in transquant-bypass mode and the partition size is not 2Nx2N
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions (all PUs of the CU)
for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
// A direction other than ALL_IDX was already assigned: use it and skip the search
if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
bmode = intraMode.cu.m_lumaIntraDir[puIdx];
// Otherwise search for the best intra prediction direction
else
{
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
uint64_t bcost;
// Candidate budget grows with the RD level and with depth
int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
{
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
// Reference sample smoothing
IntraNeighbors intraNeighbors;
// Gather availability information for the neighbouring reference samples
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
// Prepare the reference sample buffers (presumably padding plus the smoothed
// copy in intraNeighbourBuf[1]; helper not shown here)
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
// Source luma pixels of this PU and the stride used with them
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
int scaleTuSize = tuSize;
int scaleStride = stride;
// costShift is 0 here, so the "<< costShift" on the distortions below is a no-op
int costShift = 0;
// Load intra luma direction coding state from the saved context m_rqt[depth].cur
// NOTE(review): which entropy state is copied is not visible here - confirm in the entropy coder
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
/* there are three cost tiers for intra modes:
* pred[0] - mode probable, least cost
* pred[1], pred[2] - less probable, slightly more cost
* non-mpm modes - all cost the same (rbits) */
uint64_t mpms; // MPM bitmask: bit m set means mode m is an MPM (tested as mpms & (1 << mode) below)
uint32_t mpmModes[3]; // the three most probable modes
// Fill mpmModes / mpms and get rbits, the bit cost used for every non-MPM mode
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
// sa8d distortion primitive for this block size
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
// Coarse cost of each of the 35 intra prediction modes
uint64_t modeCosts[35];
/* DC prediction: compute its bits, sa8d distortion and cost; the cost seeds bcost */
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
// Bits for signalling the mode only: the MPM cost if DC is an MPM, rbits otherwise
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
// sa8d distortion between source and prediction (the variable is named sad but holds sa8d)
uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
// Coarse cost from distortion and mode bits (presumably sad + lambda * bits; helper not shown)
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
/* PLANAR prediction: compute its bits, sa8d distortion and cost; update bcost */
// TU sizes 8..32 use the filtered reference samples (buffer 1), other sizes the unfiltered ones (buffer 0)
pixel* planar = intraNeighbourBuf[0];
if (tuSize >= 8 && tuSize <= 32)
planar = intraNeighbourBuf[1];
// PLANAR intra prediction
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
// mode bits
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
// distortion
sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
// coarse cost
modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
// Lower bcost if PLANAR is cheaper (only the best cost is tracked here, not the mode)
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
/* Angular modes 2..34: compute bits, sa8d distortion and cost for each and update bcost.
The intra_pred_allangs primitive produces all 33 angular predictions in a single call. */
// Path taken when an intra_pred_allangs primitive is available for this size
if (primitives.cu[sizeIdx].intra_pred_allangs)
{
/* Transpose the source block into m_fencTransposed. Modes 2..17 are compared
against this transposed copy below (presumably because the primitive emits
those predictions transposed). */
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
// Predict all angular modes at once; the block for mode m starts at
// m_intraPredAngs[(m - 2) * scaleTuSize * scaleTuSize]
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
// iterate over angular modes 2..34
for (int mode = 2; mode < 35; mode++)
{
// bits for signalling this mode
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
// modes 2..17: sa8d against the transposed source block
if (mode < 18)
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
// modes 18..34: sa8d against the original source block
else
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
// coarse cost of this mode
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
// track the lowest coarse cost
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
// Path taken when no intra_pred_allangs primitive is available
else
{
// iterate over angular modes 2..34
for (int mode = 2; mode < 35; mode++)
{
// bits for signalling this mode
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
// 1 selects the filtered reference samples for this mode / TU size, 0 the unfiltered ones
int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
// intra prediction for this mode
primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
// sa8d distortion
sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
// coarse cost of this mode
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
// track the lowest coarse cost
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
/* At this point only the coarse cost (sa8d distortion combined with the mode
bits) has been evaluated: bcost is the lowest coarse cost and modeCosts[35]
holds the coarse cost of every mode. These are indicative but not exact; they
are used below to narrow the search before the exact RD measurement. */
/* Find the top maxCandCount candidate modes with cost within 25% of best
* or among the most probable modes. maxCandCount is derived from the
* rdLevel and depth. In general we want to try more modes at slower RD
* levels and at higher depths */
// Mark every candidate slot as empty (MAX_INT64 is the sentinel checked in the RD loop)
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
// Admission threshold for the candidate list
uint64_t paddedBcost = bcost + (bcost >> 2); // bcost * 1.25, i.e. best cost plus 25%
// Scan all 35 modes and offer the qualifying ones to the candidate list
for (int mode = 0; mode < 35; mode++)
// Qualifies when the coarse cost is below the threshold or the mode equals MPM[0]
if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0]))
/* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
}
/* measure best candidates using simple RDO (no TU splits) */
// From here on bcost tracks the exact RD cost instead of the coarse cost
bcost = MAX_INT64;
// Evaluate every candidate with the full coding path
for (int i = 0; i < maxCandCount; i++)
{
// An empty slot means there are no further candidates
if (candCostList[i] == MAX_INT64)
break;
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
// Reload the entropy coder state so each candidate starts from the same context
m_entropyCoder.load(m_rqt[depth].cur);
// Apply the candidate direction to the PU
cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
/* Code the PU with this direction and collect its costs in icosts
(rdcost is compared below). codeIntraLumaQT is called with `false`
here, matching the "no TU splits" note above. */
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
// Keep the candidate with the lowest rdcost as bcost / bmode
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
/* bmode now holds the candidate with the lowest measured RD cost and bcost that cost */
}
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
/* remeasure best mode, allowing TU splits */
// Set the chosen (or pre-assigned) direction on the PU
cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
// Reload the entropy coder state
m_entropyCoder.load(m_rqt[depth].cur);
// Costs of the final coding pass
Cost icosts;
// Code the PU with bmode; codeIntraLumaQT is called with `true` here (TU splits allowed)
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
// Accumulate this PU's distortion
totalDistortion += icosts.distortion;
// Extract the final result of this PU into reconYuv (presumably coefficients and
// reconstructed pixels; helper not shown here)
extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
// Not needed after the last PU: no later PU of this CU predicts from it
if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
PicYuv* reconPic = m_frame->m_reconPic;
pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
uint32_t dststride = reconPic->m_stride;
const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
}
}// end of for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
// The CU was split into several PUs (numPU is 4 in that case)
if (numPU > 1)
{
uint32_t combCbfY = 0;
// OR together the luma cbf of the four quarter partitions
for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
// Merge the combined flag into the first entry of the luma cbf array (m_cbf[0][0])
cu.m_cbf[0][0] |= combCbfY;
}
// TODO: remove this (restores the entropy coder context)
m_entropyCoder.load(m_rqt[depth].cur);
return totalDistortion;
}