【x265】码率控制模块的简单分析—块级码控工具（AQ和cuTree）

东城山

已于 2024-09-13 14:31:18 修改

阅读量978

点赞数 13

分类专栏： x265 文章标签： video-codec h.265 videocodec

于 2024-09-13 14:08:37 首次发布

本文链接：https://blog.csdn.net/weixin_42877471/article/details/141955399

版权

x265 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

1. 自适应QP技术（Adaptive QP）

在x265当中定义了5种AQ模式，较之于x264而言，新增了AQ_EDGE这种模式

#define X265_AQ_NONE                 0	// AQ disabled
#define X265_AQ_VARIANCE             1	// variance mode: only the current block is considered
#define X265_AQ_AUTO_VARIANCE        2	// auto-variance mode: normalised over all blocks of the frame
#define X265_AQ_AUTO_VARIANCE_BIASED 3  // auto-variance mode with an additional tunable bias term
#define X265_AQ_EDGE                 4	// edge mode

AQ模式的计算位于encoder/slicetype.cpp中，由calcAdaptiveQuantFrame()实现，主要的步骤为：

如果使用hevcAq，则使用xPreanalyze()去分析当前帧
如果使用常规AQ
（1）如果使用X265_AQ_EDGE模式，则先进行滤波，检测边缘纹理（edgeFilter）
（2）如果使用X265_AQ_AUTO_VARIANCE、X265_AQ_AUTO_VARIANCE_BIASED或X265_AQ_EDGE模式中的一种，则根据全局范围的纹理情况来计算qp_adj
（a）如果是X265_AQ_EDGE模式，会计算边缘密度，依据边缘密度计算块级（默认为16x16）qp_adj
（b）如果非X265_AQ_EDGE模式，根据AC energy计算块级qp_adj
（c）将所有块的qp_adj平均计算，得到平均qp_adj
（3）如果是X265_AQ_VARIANCE，根据strength调整qp

/* Compute the per-block adaptive-quantization QP offsets of one frame and store them in
 * curFrame->m_lowres (qpAqOffset, qpCuTreeOffset, invQscaleFactor). As a side product the
 * per-plane wp_sum / wp_ssd accumulators are filled (through acEnergyCu) and, when weighted
 * prediction is enabled, turned into a variance-like value; with dynamic refinement or
 * fades enabled the per-block and per-frame variance are stored as well.
 *
 * curFrame  frame to analyse; its m_lowres members are written
 * param     encoder parameters (aqMode, aqStrength, qgSize, hevcAq, ...) */
void LookaheadTLD::calcAdaptiveQuantFrame(Frame* curFrame, x265_param* param)
{
	/* Actual adaptive quantization */
	int maxCol = curFrame->m_fencPic->m_picWidth;
	int maxRow = curFrame->m_fencPic->m_picHeight;
	int blockCount, loopIncr;
	float modeOneConst, modeTwoConst;
	/*
		qgSize is the quantization group size. Article author's note (not verifiable from
		this function): a QG is a fixed-size square region whose coded CUs share one QP,
		and the default qgSize is 32.
		What this function does with it: qgSize == 8 selects 8x8 analysis blocks counted at
		full resolution; any other value selects 16x16 analysis blocks.
	*/
	if (param->rc.qgSize == 8)
	{
		blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
		modeOneConst = 11.427f;
		modeTwoConst = 8.f;
		loopIncr = 8;
	}
	else
	{
		blockCount = widthInCU * heightInCU;
		modeOneConst = 14.427f;
		modeTwoConst = 11.f;
		loopIncr = 16;
	}

	float* quantOffsets = curFrame->m_quantOffsets;
	/*
		m_lowres holds the lookahead (low-resolution) data of the frame.
		Reset the per-plane sum / sum-of-squares accumulators that acEnergyVar() adds to.
	*/
	for (int y = 0; y < 3; y++)
	{
		curFrame->m_lowres.wp_ssd[y] = 0;
		curFrame->m_lowres.wp_sum[y] = 0;
	}
	// Skip the offset computation when bStatRead and cuTree are both set and the frame is
	// referenced (presumably the offsets then come from the stats file — confirm).
	if (!(param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
	{
		/* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
		// AQ is off, or its strength is zero
		if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) 
		{
			if (param->rc.aqMode && param->rc.aqStrength == 0)
			{
				if (quantOffsets)
				{
					// Only the externally supplied per-block offsets apply
					for (int cuxy = 0; cuxy < blockCount; cuxy++)
					{
						curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
						curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
					}
				}
				else
				{
					memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
					memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
					for (int cuxy = 0; cuxy < blockCount; cuxy++)
						curFrame->m_lowres.invQscaleFactor[cuxy] = 256; // zero offsets: reset the scale factor to 256
				}
			}

			/* Need variance data for weighted prediction and dynamic refinement*/
			if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) // weighted (bi-)prediction enabled
			{
				// Called only for its side effect of accumulating wp_sum / wp_ssd
				for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
					for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
						acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
			}
		}
		else // AQ is enabled with a non-zero strength
		{
			// 1. hevcAq selects the alternative analysis implemented by xPreanalyze()
			if (param->rc.hevcAq)
			{
				// New method for calculating variance and qp offset
				xPreanalyze(curFrame);
			}
			else
			{	// 2. Regular AQ
				int blockXY = 0, inclinedEdge = 0;
				double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
				double bias_strength = 0.f;
				double strength = 0.f;
				// 3. Edge mode: build the edge and angle pictures first (see edgeFilter)
				if (param->rc.aqMode == X265_AQ_EDGE)
					edgeFilter(curFrame, param);

				/*
					Defaults quoted by the article author (not verifiable from this function):
					aqMode = X265_AQ_AUTO_VARIANCE, bHistBasedSceneCut = 0, recursionSkipMode = 1
				*/
				if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
				{
					pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
					// Copy the edge picture into m_edgeBitPic, shifted by SHIFT_TO_BITPLANE
					primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
						curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
				}

				if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
				{
					double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8))); // bit-depth correction
					// 4. First pass: raw per-block measure (stored temporarily in qpCuTreeOffset)
					//    plus its frame-wide mean and mean square
					for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
					{
						for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
						{
							uint32_t energy, edgeDensity, avgAngle;
							// AC energy of the block (also accumulates wp_sum / wp_ssd)
							energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
							if (param->rc.aqMode == X265_AQ_EDGE)
							{
								// Edge mode: measure the edge density of the block; avgAngle receives
								// the average edge angle of the block
								edgeDensity = edgeDensityCu(curFrame, avgAngle, blockX, blockY, param->rc.qgSize);
								if (edgeDensity)
								{	// derive qp_adj from the edge density
									qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1);
									//Increasing the QP of a block if its edge orientation lies around the multiples of 45 degree
									// i.e. the angle is within +/-15 of EDGE_INCLINATION or of EDGE_INCLINATION + 90
									if ((avgAngle >= EDGE_INCLINATION - 15 && avgAngle <= EDGE_INCLINATION + 15) || (avgAngle >= EDGE_INCLINATION + 75 && avgAngle <= EDGE_INCLINATION + 105))
										curFrame->m_lowres.edgeInclined[blockXY] = 1; // block carries an inclined edge
									else
										curFrame->m_lowres.edgeInclined[blockXY] = 0;
								}
								else // no edges in the block: fall back to the AC energy
								{
									qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
									curFrame->m_lowres.edgeInclined[blockXY] = 0;
								}
							}
							else // non-edge modes
								qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
							// Store the raw measure of this block
							curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
							avg_adj += qp_adj;
							avg_adj_pow2 += qp_adj * qp_adj;
							blockXY++;
						}
					}
					avg_adj /= blockCount;
					avg_adj_pow2 /= blockCount;
					// Scale the strength by the frame average of the raw measure
					strength = param->rc.aqStrength * avg_adj;
					// Shift the average (modeTwoConst is 11.f for 16x16 blocks, 8.f for 8x8 blocks)
					avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
					bias_strength = param->rc.aqStrength;
				}
				else // X265_AQ_VARIANCE: the strength does not depend on the frame
					strength = param->rc.aqStrength * 1.0397f;

				// 5. Second pass: turn the raw measure into the final offset, per AQ mode
				blockXY = 0;
				for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
				{
					for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
					{
						if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED) // auto-variance with bias term
						{
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							// the bias term uses the raw measure, before it is overwritten
							qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
						}
						else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE) // auto-variance
						{
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							qp_adj = strength * (qp_adj - avg_adj);
						}
						else if (param->rc.aqMode == X265_AQ_EDGE) // edge mode
						{
							inclinedEdge = curFrame->m_lowres.edgeInclined[blockXY];
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							if (inclinedEdge && (qp_adj - avg_adj > 0)) 
								// Inclined-edge block above the frame average: strengthen the offset
								// by AQ_EDGE_BIAS (defined elsewhere; the article quotes 0.5 — confirm)
								qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj)); 
							else
								qp_adj = strength * (qp_adj - avg_adj);
						}
						else
						{	// X265_AQ_VARIANCE: offset from log2 of the block's own AC energy
							uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
							qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
						}

						// HDR10 option: additional offset chosen from the block's average luma;
						// averages in [434, 501) leave qp_adj unchanged
						if (param->bHDR10Opt)
						{
							uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
							uint32_t lumaAvg = sum / (loopIncr * loopIncr);
							if (lumaAvg < 301)
								qp_adj += 3;
							else if (lumaAvg >= 301 && lumaAvg < 367)
								qp_adj += 2;
							else if (lumaAvg >= 367 && lumaAvg < 434)
								qp_adj += 1;
							else if (lumaAvg >= 501 && lumaAvg < 567)
								qp_adj -= 1;
							else if (lumaAvg >= 567 && lumaAvg < 634)
								qp_adj -= 2;
							else if (lumaAvg >= 634 && lumaAvg < 701)
								qp_adj -= 3;
							else if (lumaAvg >= 701 && lumaAvg < 767)
								qp_adj -= 4;
							else if (lumaAvg >= 767 && lumaAvg < 834)
								qp_adj -= 5;
							else if (lumaAvg >= 834)
								qp_adj -= 6;
						}
						// Externally supplied per-block offsets are added on top
						if (quantOffsets != NULL)
							qp_adj += quantOffsets[blockXY];
						// Store the final offset and its fixed-point scale factor
						curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
						curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
						curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
						blockXY++;
					}
				}
			}
		}

		// 8x8 quantization groups: average each 2x2 group of scale factors into invQscaleFactor8x8
		if (param->rc.qgSize == 8)
		{
			for (int cuY = 0; cuY < heightInCU; cuY++)
			{
				for (int cuX = 0; cuX < widthInCU; cuX++)
				{
					const int cuXY = cuX + cuY * widthInCU;
					curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
				}
			}
		}
	}
	// Weighted prediction: finalise the per-plane statistics
	if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
	{
		// The offset computation above was skipped, so wp_sum / wp_ssd still have to be accumulated
		if (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame))
		{
			for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
				for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
					acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
		}

		int hShift = CHROMA_H_SHIFT(param->internalCsp);
		int vShift = CHROMA_V_SHIFT(param->internalCsp);
		// Round the picture size to the nearest multiple of 16 (note: maxCol / maxRow stay
		// modified for the code below)
		maxCol = ((maxCol + 8) >> 4) << 4;
		maxRow = ((maxRow + 8) >> 4) << 4;
		int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift };
		int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };

		for (int i = 0; i < 3; i++)
		{
			uint64_t sum, ssd;
			sum = curFrame->m_lowres.wp_sum[i];
			ssd = curFrame->m_lowres.wp_ssd[i];
			// ssd - round(sum^2 / sampleCount)
			curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
		}
	}
	// Dynamic refinement or fade detection: store per-block and per-frame variance
	if (param->bDynamicRefine || param->bEnableFades)
	{
		uint64_t blockXY = 0, rowVariance = 0;
		curFrame->m_lowres.frameVariance = 0;
		for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
		{
			for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
			{
				curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
				rowVariance += curFrame->m_lowres.blockVariance[blockXY];
				blockXY++;
			}
			// NOTE(review): rowVariance is not reset between rows — confirm this is intended
			curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
		}
		curFrame->m_lowres.frameVariance /= maxRow;
	}
}

从上面的代码中看，4种模式对应的qp_adj方式为：

X265_AQ_VARIANCE
（1）strength = param->rc.aqStrength * 1.0397f
（2）qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)))

X265_AQ_AUTO_VARIANCE
（1）qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
（2）strength = param->rc.aqStrength * avg_adj;（avg_adj为qp_adj的均值）
（3）avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
（4）qp_adj = strength * (qp_adj - avg_adj)

X265_AQ_AUTO_VARIANCE_BIASED
（1）qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
（2）strength = param->rc.aqStrength * avg_adj;（avg_adj为qp_adj的均值）
（3）avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
（4）bias_strength = param->rc.aqStrength;
（5）qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj))

X265_AQ_EDGE
（1）计算qp_adj初始值
如果edgeDensity不为0：qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1)
如果edgeDensity为0：qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
（2）调整qp_adj的值
如果当前块为edge块，并且qp_adj - avg_adj > 0：qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj))
其他情况：qp_adj = strength * (qp_adj - avg_adj)

总体上看，X265_AQ_VARIANCE模式为aq调整的基础，如果考虑了当前帧中其他的块，变为X265_AQ_AUTO_VARIANCE模式。如果想要更精确地调控，可以增加一些调控因子，演变为X265_AQ_AUTO_VARIANCE_BIASED。如果考虑不局限于AC energy，增加梯度的检测，演变为X265_AQ_EDGE

1.2 图像纹理的检测（edgeFilter）

函数的主要功能是对当前帧进行图像纹理的检测，具体来说，会对输入图像进行高斯滤波，随后进行sobel滤波获得图像的边界纹理

/* Build the edge picture and the edge-angle picture of the current frame.
 *
 * The luma plane is copied into m_edgePic and m_gaussianPic, the interior of m_gaussianPic
 * is smoothed with a 5x5 Gaussian kernel, and computeEdge() then writes the thresholded
 * gradient map into m_edgePic and the gradient angles into m_thetaPic.
 *
 * curFrame  frame whose m_edgePic / m_gaussianPic / m_thetaPic buffers are rewritten
 * param     encoder parameters; only maxCUSize is read (to size the buffers to clear) */
void edgeFilter(Frame *curFrame, x265_param* param)
{
    int height = curFrame->m_fencPic->m_picHeight;
    int width = curFrame->m_fencPic->m_picWidth;
    intptr_t stride = curFrame->m_fencPic->m_stride;
    uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize;
    int maxHeight = numCuInHeight * param->maxCUSize;
    // Clear the three work pictures, margins included
    memset(curFrame->m_edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
    memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
    memset(curFrame->m_thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));

    // All work pictures are addressed from the first visible luma sample
    pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
    pixel *edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;

    // Seed both pictures with the unfiltered luma so that the border keeps source values
    for (int i = 0; i < height; i++)
    {
        memcpy(edgePic, src, width * sizeof(pixel));
        memcpy(refPic, src, width * sizeof(pixel));
        src += stride;
        edgePic += stride;
        refPic += stride;
    }

    //Applying Gaussian filter on the picture
    // Rewind the pointers that the copy loop advanced
    src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
    refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel pixelValue = 0;

    for (int rowNum = 0; rowNum < height; rowNum++)
    {
        for (int colNum = 0; colNum < width; colNum++)
        {
            // Ignoring the border pixels of the picture: the 5x5 window needs two samples on
            // every side. The previous test used '!=' for the far edges, which skipped only
            // one row/column and still filtered the last one with a window reaching two
            // samples past the picture.
            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2))
            {
                /*  5x5 Gaussian filter
                    [2   4   5   4   2]
                 1  [4   9   12  9   4]
                --- [5   12  15  12  5]
                159 [4   9   12  9   4]
                    [2   4   5   4   2]*/

                const intptr_t rowOne = (rowNum - 2)*stride, colOne = colNum - 2;
                const intptr_t rowTwo = (rowNum - 1)*stride, colTwo = colNum - 1;
                const intptr_t rowThree = rowNum * stride, colThree = colNum;
                const intptr_t rowFour = (rowNum + 1)*stride, colFour = colNum + 1;
                const intptr_t rowFive = (rowNum + 2)*stride, colFive = colNum + 2;
                const intptr_t index = (rowNum*stride) + colNum;
                // Weighted sum of the 5x5 neighbourhood, normalised by the kernel sum (159)
                pixelValue = ((2 * src[rowOne + colOne] + 4 * src[rowOne + colTwo] + 5 * src[rowOne + colThree] + 4 * src[rowOne + colFour] + 2 * src[rowOne + colFive] +
                    4 * src[rowTwo + colOne] + 9 * src[rowTwo + colTwo] + 12 * src[rowTwo + colThree] + 9 * src[rowTwo + colFour] + 4 * src[rowTwo + colFive] +
                    5 * src[rowThree + colOne] + 12 * src[rowThree + colTwo] + 15 * src[rowThree + colThree] + 12 * src[rowThree + colFour] + 5 * src[rowThree + colFive] +
                    4 * src[rowFour + colOne] + 9 * src[rowFour + colTwo] + 12 * src[rowFour + colThree] + 9 * src[rowFour + colFour] + 4 * src[rowFour + colFive] +
                    2 * src[rowFive + colOne] + 4 * src[rowFive + colTwo] + 5 * src[rowFive + colThree] + 4 * src[rowFive + colFour] + 2 * src[rowFive + colFive]) / 159);
                refPic[index] = pixelValue;
            }
        }
    }
    // Run the gradient filter on the smoothed picture; angles are requested (bcalcTheta = true)
    if(!computeEdge(edgePic, refPic, edgeTheta, stride, height, width, true))
        x265_log(NULL, X265_LOG_ERROR, "Failed edge computation!");
}

computeEdge()的代码为

/* Gradient-based edge detection on a smoothed picture.
 *
 * For every pixel except the one-pixel border, a 3x3 gradient operator with weights
 * 3 / 10 / 3 is evaluated on refPic. edgePic receives whitePixel where the gradient
 * magnitude reaches EDGE_THRESHOLD and 0 elsewhere; when bcalcTheta is set, edgeTheta
 * receives the gradient angle in degrees, folded into [0, 180].
 *
 * Returns false when a required buffer is missing, true otherwise. */
bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
{
    // The angle plane is only mandatory when angles were requested
    if (!edgePic || !refPic || (bcalcTheta && !edgeTheta))
        return false;

    const int border = 1;
    const int rowEnd = height - border;
    const int colEnd = width - border;
    const pixel blackPixel = 0;

    // Applying the filter to every pixel except the border pixels
    for (int y = border; y < rowEnd; y++)
    {
        const intptr_t curRow = y * stride;
        const intptr_t rowAbove = curRow - stride;
        const intptr_t rowBelow = curRow + stride;

        for (int x = border; x < colEnd; x++)
        {
            /*  Horizontal and vertical gradients
                [ -3   0   3 ]        [-3   -10  -3 ]
            gH =[ -10  0   10]   gV = [ 0    0    0 ]
                [ -3   0   3 ]        [ 3    10   3 ]
                (the classic Sobel operator uses the weights 1 / 2 / 1 instead) */
            const intptr_t colLeft = x - border;
            const intptr_t colRight = x + border;
            const intptr_t centre = curRow + x;

            const int topLeft = refPic[rowAbove + colLeft];
            const int top = refPic[rowAbove + x];
            const int topRight = refPic[rowAbove + colRight];
            const int left = refPic[curRow + colLeft];
            const int right = refPic[curRow + colRight];
            const int bottomLeft = refPic[rowBelow + colLeft];
            const int bottom = refPic[rowBelow + x];
            const int bottomRight = refPic[rowBelow + colRight];

            const float gradientH = (float)(-3 * topLeft + 3 * topRight - 10 * left + 10 * right - 3 * bottomLeft + 3 * bottomRight);
            const float gradientV = (float)(-3 * topLeft - 10 * top - 3 * topRight + 3 * bottomLeft + 10 * bottom + 3 * bottomRight);
            const float gradientMagnitude = sqrtf(gradientH * gradientH + gradientV * gradientV);

            if (bcalcTheta)
            {
                const float radians = atan2(gradientV, gradientH);
                float theta = (float)((radians * 180) / PI);
                // Fold negative angles into the upper half-plane
                if (theta < 0)
                    theta = 180 + theta;
                edgeTheta[centre] = (pixel)theta;
            }

            // Binarise: whitePixel marks an edge sample
            if (gradientMagnitude >= EDGE_THRESHOLD)
                edgePic[centre] = whitePixel;
            else
                edgePic[centre] = blackPixel;
        }
    }
    return true;
}

1.3 计算AC energy（acEnergyCu）

函数计算了一个块在所有平面（Y/Cb/Cr）上的AC energy之和

/* Sum the AC energy of one block over all planes of the source picture.
 * Luma is always measured; the two chroma planes are added unless either the requested
 * colour space or the picture's own colour space is 4:0:0. */
uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
{
    const intptr_t lumaStride = curFrame->m_fencPic->m_stride;
    const intptr_t lumaOffset = blockX + (blockY * lumaStride);

    uint32_t energy = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + lumaOffset, lumaStride, 0, csp, qgSize);

    const bool hasChroma = csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400;
    if (hasChroma)
    {
        // The block position is scaled down by the chroma subsampling shifts
        const intptr_t chromaStride = curFrame->m_fencPic->m_strideC;
        const int shiftX = CHROMA_H_SHIFT(csp);
        const int shiftY = CHROMA_V_SHIFT(csp);
        const intptr_t chromaOffset = (blockX >> shiftX) + ((blockY >> shiftY) * chromaStride);

        for (int plane = 1; plane <= 2; plane++)
            energy += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[plane] + chromaOffset, chromaStride, plane, csp, qgSize);
    }

    x265_emms();
    return energy;
}

计算单通道的AC energy

/* AC energy of one block in a single plane (0 = Y, 1 = Cb, 2 = Cr).
 * Luma, and chroma of 4:4:4 content, is measured in place on an 8x8 block (qgSize == 8)
 * or a 16x16 block. Chroma of any other format is first copied into an aligned local
 * buffer and measured on a 4x4 block (qgSize == 8) or an 8x8 block. */
inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat, uint32_t qgSize)
{
    const bool smallGroup = (qgSize == 8);
    const bool reducedChroma = plane && (colorFormat != X265_CSP_I444);

    if (!reducedChroma)
    {
        // Full-size plane: measure directly on the source picture
        if (smallGroup)
            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(src, srcStride), 6, plane);
        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
    }

    if (smallGroup)
    {
        ALIGN_VAR_4(pixel, pix[4 * 4]);
        primitives.cu[BLOCK_4x4].copy_pp(pix, 4, src, srcStride);
        // shift 4 == log2 of the 16 samples in a 4x4 block
        return acEnergyVar(curFrame, primitives.cu[BLOCK_4x4].var(pix, 4), 4, plane);
    }

    ALIGN_VAR_8(pixel, pix[8 * 8]);
    primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
    // shift 6 == log2 of the 64 samples in an 8x8 block
    return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
}

acEnergyVar()的定义如下，根据块的像素和（sum）与平方和（ssd）得到该块的AC energy，同时把sum和ssd累加到帧级的wp_sum和wp_ssd中

/* Derive the AC energy of a block from the packed result of a var() primitive.
 * sum_ssd carries the pixel sum in its low 32 bits and the sum of squares in its high
 * 32 bits; both are also added to the frame's per-plane wp_sum / wp_ssd accumulators.
 * Returns ssd - (sum * sum >> shift), truncated to 32 bits. */
inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
{
    const uint32_t pixelSum = (uint32_t)sum_ssd;
    const uint32_t squareSum = (uint32_t)(sum_ssd >> 32);

    curFrame->m_lowres.wp_sum[plane] += pixelSum;
    curFrame->m_lowres.wp_ssd[plane] += squareSum;

    // Square in 64 bits so the product cannot wrap before the shift
    const uint64_t meanPart = ((uint64_t)pixelSum * pixelSum) >> shift;
    return (uint32_t)(squareSum - meanPart);
}

1.4 计算梯度密度（edgeDensityCu）

如果当前的模式为X265_AQ_EDGE模式，则会计算梯度密度。具体来说，先寻找一个块的平均角度，随后在边缘图像（m_edgePic）上计算该块的方差（AC energy），作为该块的边缘密度

/* Edge density of one luma block: the AC energy of the block measured on the edge picture
 * (m_edgePic). avgAngle receives the mean of the block's entries in the angle picture
 * (m_thetaPic). The block is 8x8 when qgSize == 8 and 16x16 otherwise. */
uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
{
    const intptr_t srcStride = curFrame->m_fencPic->m_stride;
    const intptr_t blockOffsetLuma = blockX + (blockY * srcStride);

    // Both pictures are addressed from the first sample inside the margins
    pixel *edgeBlock = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX + blockOffsetLuma;
    pixel *thetaBlock = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX + blockOffsetLuma;

    const int plane = 0; // Sobel filter is applied only on Y component
    const bool smallGroup = (qgSize == 8);

    // Mean edge angle of the block, from the angles written by the edge filter
    findAvgAngle(thetaBlock, srcStride, smallGroup ? qgSize : 16, avgAngle);

    uint32_t density;
    if (smallGroup)
        density = acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(edgeBlock, srcStride), 6, plane);
    else
        density = acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(edgeBlock, srcStride), 8, plane);

    x265_emms();
    return density;
}

findAvgAngle()的实现如下，通过求一个块中的平均像素角度来估算一个块的角度

//Find the angle of a block by averaging the pixel angles 
inline void findAvgAngle(const pixel* block, intptr_t stride, uint32_t size, uint32_t &angle)
{
    int sum = 0;
    for (uint32_t y = 0; y < size; y++)
    {
        for (uint32_t x = 0; x < size; x++)
        {
            sum += block[x];
        }
        block += stride;
    }
    angle = sum / (size*size);
}

2.宏块树（cuTree）

在x264当中，有mbtree这一项工具，用于提升宏块级编码效率。在x265当中也有类似的技术，叫做cuTree，两者差不多。具体来说，cuTree位于lookahead模块中，通过将lookahead队列中的帧按照从后向前的顺序进行分析，来获得前序帧中CU相对于后序帧中CU的重要程度，这里的后序帧CU会将前序帧CU作为参考CU。在主线程编码流程中，如果前序帧中的CU重要程度比较高，说明应该为其使用较低的QP（即高质量编码），这样后序帧中的CU就能够获得更好的编码效果

/*
	例如，队列中有3个P帧，主线程编码顺序为Pn-1，Pn，Pn+1，如下
	... Pn-1 -> Pn -> Pn+1 ...
	cuTree分析时的顺序为Pn+1，Pn，Pn-1
*/

cuTree的计算流程位于encoder\slicetype.cpp中，由cuTree()实现，主要工作流程为
（1）计算帧intra和inter cost（singleCost）
（2）计算CU的传播cost（estimateCUPropagate）
（3）根据前面计算的传播cost来评估qp调整量（cuTreeFinish）

/* Run cuTree over a lookahead window.
 *
 * The frames are walked from the end of the window towards its start. For every pair of
 * neighbouring non-B frames the frame costs are estimated (singleCost), the propagate
 * cost is accumulated (estimateCUPropagate), and finally cuTreeFinish() is called on the
 * earliest non-B frame reached.
 *
 * frames     lowres frames of the window, indices 0..numframes
 * numframes  index of the last frame in the window
 * bIntra     true when frame 0 is to be treated as intra; processing then includes index 0 */
void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
{
    // Lowest frame index to process: 0 for intra, 1 otherwise
    int idx = !bIntra;
    int lastnonb, curnonb = 1;
    int bframes = 0;

    x265_emms();
    double totalDuration = 0.0;
    for (int j = 0; j <= numframes; j++)
        totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;
    // Average duration of one frame
    double averageDuration = totalDuration / (numframes + 1);

    int i = numframes;
    // Walk back from the end of the window to the last non-B frame
    while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
        i--;

    lastnonb = i;

    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
     * lookahead=0, so that's what's currently implemented. */
    // No lookahead (lookaheadDepth == 0)
    if (!m_param->lookaheadDepth)
    {
        if (bIntra)
        {
            // Intra frame: clear the propagate cost, use the AQ offsets as cuTree offsets, done
            memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
            // With 8x8 quantization groups the offset arrays hold four entries per CU
            if (m_param->rc.qgSize == 8)
                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * 4 * sizeof(double));
            else
                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
            return;
        }
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
        memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
    }
    else
    {
        // Nothing to process when the last non-B frame lies before the first index
        if (lastnonb < idx)
            return;
        memset(frames[lastnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
    }

    CostEstimateGroup estGroup(*this, frames);
    // Propagate backwards, one mini-GOP (curnonb .. lastnonb) per iteration
    while (i-- > idx)
    {
        curnonb = i;
        // Find the previous non-B frame; it becomes the start of this mini-GOP
        while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
            curnonb--;

        if (curnonb < idx)
            break;
        // 1. Frame cost of lastnonb when predicted from curnonb
        estGroup.singleCost(curnonb, lastnonb, lastnonb);

        memset(frames[curnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
        bframes = lastnonb - curnonb - 1;
        // B-pyramid with more than one B-frame: the middle B-frame acts as a reference
        if (m_param->bBPyramid && bframes > 1) 
        {
            int middle = (bframes + 1) / 2 + curnonb;
            estGroup.singleCost(curnonb, lastnonb, middle);
            memset(frames[middle]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
            while (i > curnonb)
            {
                // B-frames after the middle one reference (middle, lastnonb),
                // those before it reference (curnonb, middle)
                int p0 = i > middle ? middle : curnonb;
                int p1 = i < middle ? middle : lastnonb;
                if (i != middle)
                {
                    estGroup.singleCost(p0, p1, i);
                    estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                }
                i--;
            }

            estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
        }
        else
        {
            // Frames strictly between curnonb and lastnonb (the B-frames)
            while (i > curnonb)
            {
                estGroup.singleCost(curnonb, lastnonb, i);
                estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                i--;
            }
        }
        // 2. Propagate the cost of lastnonb to its reference curnonb
        estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
        lastnonb = curnonb;
    }

    if (!m_param->lookaheadDepth)
    {
        estGroup.singleCost(0, lastnonb, lastnonb);
        estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
        // Undo the swap done before the loop
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
    }
    // 3. Turn the accumulated propagate cost into QP offsets
    cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
    if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
        cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
}

2.1 计算帧损失（singleCost）

singleCost()中调用了estimateFrameCost()计算帧的cost

/* Estimate the cost of frame b with references p0 / p1 on the calling thread.
 * The thread-local data slot used is index m_numWorkers when a thread pool exists
 * and index 0 otherwise. */
int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
{
    LookaheadTLD* tld = &m_lookahead.m_tld[0];
    if (m_lookahead.m_pool)
        tld = &m_lookahead.m_tld[m_lookahead.m_pool->m_numWorkers];

    return estimateFrameCost(*tld, p0, p1, b, intraPenalty);
}

estimateFrameCost()的定义如下

/*
    Estimate the cost of one frame.
    p0 is the index of the forward reference, p1 the index of the backward reference and
    b the index of the frame itself.
    p0 == p1 == b means no reference at all (I-frame); p1 == b means a forward reference
    only (P-frame).

    Article author's note (the per-CU logic is outside this function): as an I-frame every
    block costs its intra cost, as a P-frame min(intra cost, inter cost), as a B-frame its
    inter cost.

    Every frame carries a cost matrix costEst[b - p0][p1 - b]: the cost of frame b when it
    uses p0 as forward and p1 as backward reference. A cached entry is reused when present.

    bIntraPenalty adds a penalty proportional to the number of intra blocks.
*/
int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
{
    Lowres*     fenc  = m_frames[b];
    x265_param* param = m_lookahead.m_param;
    int64_t     score = 0;

    // Reuse the cached cost if this (p0, p1, b) combination was already evaluated
    if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
        score = fenc->costEst[b - p0][p1 - b];
    else
    {
        // A motion search is needed for a list whose first MV still holds 0x7FFF
        // (presumably the "not searched yet" marker — confirm)
        bool bDoSearch[2];
        bDoSearch[0] = fenc->lowresMvs[0][b - p0][0].x == 0x7FFF;			// forward search needed
        bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFF; // backward search needed

#if CHECKED_BUILD
        X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0][0].x == 0x7FFE), "motion search batch duplication L0\n");
        X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFE), "motion search batch duplication L1\n");
        if (bDoSearch[0]) fenc->lowresMvs[0][b - p0][0].x = 0x7FFE;
        if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b][0].x = 0x7FFE;
#endif

        fenc->weightedRef[b - p0].isWeighted = false;
        // Weighted prediction: analyse weights against the forward reference
        if (param->bEnableWeightedPred && bDoSearch[0])
            tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);

        fenc->costEst[b - p0][p1 - b] = 0;
        fenc->costEstAq[b - p0][p1 - b] = 0;
        // Outside batch mode, with more than one cooperative slice, split the frame across workers
        if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
        {
            /* Use cooperative mode if a thread pool is available and the cost estimate is
             * going to need motion searches or bidir measurements */
            memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);

            // Publish the job description under the lock
            m_lock.acquire();
            X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
            m_coop.p0 = p0;
            m_coop.p1 = p1;
            m_coop.b = b;
            m_coop.bDoSearch[0] = bDoSearch[0];
            m_coop.bDoSearch[1] = bDoSearch[1];
            m_jobTotal = m_lookahead.m_numCoopSlices;
            m_jobAcquired = 0;
            m_lock.release();

            tryBondPeers(*m_lookahead.m_pool, m_jobTotal);

            processTasks(-1);

            waitForExit();

            // Sum the per-slice results into the frame totals
            for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
            {
                fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
                fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
                if (p1 == b) // no backward reference: accumulate the intra block count
                    fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
            }
        }
        else
        {
            /* Calculate MVs for 1/16th resolution*/
            bool lastRow;
            if (param->bEnableHME)
            {
                lastRow = true;
                for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
                {
                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
                        estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
                    lastRow = false;
                }
            }
            /* Estimate the cost of every lowres CU, walking the frame from the bottom-right
             * corner to the top-left corner.
             * Article author's note: x264's slicetype_slice_cost explains the reverse order by
             * the MVs being reused as predictors in the main encode, where the reverse order
             * improves the overall quality of MV prediction. The author additionally
             * speculates (unverified) about typical content motion, about more information
             * being available when the top-left CUs are evaluated, and about empirical tuning. */
            lastRow = true;
            for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
            {
                fenc->rowSatds[b - p0][p1 - b][cuY] = 0;

                for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);

                lastRow = false;
            }
        }

        score = fenc->costEst[b - p0][p1 - b];

        // Frames with a backward reference get their cost scaled by 100 / (130 + bFrameBias)
        if (b != p1)
            score = score * 100 / (130 + param->bFrameBias);

        fenc->costEst[b - p0][p1 - b] = score;
    }

    if (bIntraPenalty)
        // arbitrary penalty for I-blocks after B-frames
        score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);

    return score;
}

estimateCUCost()的定义如下

/* Estimate the lowres cost of one CU of frame m_frames[b] against m_frames[p0] (list 0)
 * and m_frames[p1] (list 1), and accumulate it into the frame (or slice) totals.
 *
 *   tld        thread-local data; its motion estimator (tld.me) performs the SATD and search
 *   cuX, cuY   CU coordinates inside the CU grid selected by hme
 *   p0, p1, b  indices into m_frames; the CU is treated as bidirectional when b < p1
 *   bDoSearch  per-list flag: when false, the stored cost for that list is reused
 *   lastRow    when true, the row below has no MVs yet and is not used for MV candidates
 *   slice      negative: accumulate into fenc; otherwise accumulate into m_slice[slice]
 *   hme        when true, work on the lowerRes planes/MVs and return right after the search */
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
    Lowres *fref0 = m_frames[p0];
    Lowres *fref1 = m_frames[p1];
    Lowres *fenc  = m_frames[b];

    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
    // With hme the CU grid is m_4x4Width x m_4x4Height, otherwise m_8x8Width x m_8x8Height
    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
    const int bBidir = (b < p1);
    const int cuXY = cuX + cuY * widthInCU;
    // Index of the covering CU in a grid of half the width (used below to fetch the lowerRes MV)
    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
    const int cuSize = X265_LOWRES_CU_SIZE;
    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);

    // The source PU is only set up when some search or bidir evaluation will actually run
    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
    else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);


    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
    int lowresPenalty = 4;
    int listDist[2] = { b - p0, p1 - b};

    MV mvmin, mvmax;
    int bcost = tld.me.COST_MAX;
    int listused = 0;

    // TODO: restrict to slices boundaries
    // establish search bounds that don't cross extended frame boundaries
    mvmin.x = (int32_t)(-cuX * cuSize - 8);
    mvmin.y = (int32_t)(-cuY * cuSize - 8);
    mvmax.x = (int32_t)((widthInCU - cuX - 1) * cuSize + 8);
    mvmax.y = (int32_t)((heightInCU - cuY - 1) * cuSize + 8);

    // One pass per reference list: list 0 always, list 1 only for bidirectional CUs
    for (int i = 0; i < 1 + bBidir; i++)
    {
        // Reference into the stored per-CU cost, so assignments below update the frame's table
        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
        int skipCost = INT_MAX;
        // No search requested for this list: just compare against the stored cost
        if (!bDoSearch[i])
        {
            COPY2_IF_LT(bcost, fencCost, listused, i + 1);
            continue;
        }

        int numc = 0;
        MV mvc[5], mvp;
        // With hme the MV slot lives in lowerResMvs, otherwise in lowresMvs
        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
        ReferencePlanes* fref = i ? fref1 : wfref0;

        /* Reverse-order MV prediction */
#define MVC(mv) mvc[numc++] = mv;
        // Gather candidate MVs from the right and lower neighbours into mvc[]
        if (cuX < widthInCU - 1)
            MVC(fencMV[1]);	// right neighbour
        if (!lastRow)
        {
            MVC(fencMV[widthInCU]);	// neighbour below
            if (cuX > 0)
                MVC(fencMV[widthInCU - 1]);	// neighbour below-left
            if (cuX < widthInCU - 1)
                MVC(fencMV[widthInCU + 1]);	// neighbour below-right
        }
        // Non-hme pass: also try the lowerRes MV of the covering CU, scaled by 2, when one is stored
        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
        {
            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
        }
#undef MVC
        // No candidate available: predict a zero MV
        if (!numc)
            mvp = 0;
        else
        {
            ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
            int mvpcost = MotionEstimate::COST_MAX;

            /* measure SATD cost of each neighbor MV (estimating merge analysis)
             * and use the lowest cost MV as MVP (estimating AMVP). Since all
             * mvc[] candidates are measured here, none are passed to motionEstimate */
            // The candidate with the lowest SATD becomes mvp
            for (int idx = 0; idx < numc; idx++)
            {
                intptr_t stride = X265_LOWRES_CU_SIZE;
                // Motion-compensated prediction on the lowres reference for this candidate
                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
                // SATD of the prediction against the source PU
                int cost = tld.me.bufSATD(src, stride);
                COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
                /* Except for mv0 case, everything else is likely to have enough residual to not trigger the skip. */
                if (!mvp.notZero() && bBidir)
                    skipCost = cost;
            }
        }
        // Run the motion search starting from the predictor chosen above
        int searchRange = m_lookahead.m_param->bEnableHME ? (hme ? m_lookahead.m_param->hmeRange[0] : m_lookahead.m_param->hmeRange[1]) : s_merange;
        /* ME will never return a cost larger than the cost @MVP, so we do not
         * have to check that ME cost is more than the estimated merge cost */
        if(!hme)
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
        else
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
        // Bidirectional CU with a cheap zero-MV prediction: take the skip cost and a zero MV
        if (skipCost < 64 && skipCost < fencCost && bBidir)
        {
            fencCost = skipCost;
            *fencMV = 0;
        }
        COPY2_IF_LT(bcost, fencCost, listused, i + 1);
    }
    // The hme pass only fills the lowerRes MVs/costs; nothing below applies to it
    if (hme)
        return;

    if (bBidir) /* B, also consider bidir */
    {
        /* NOTE: the wfref0 (weightp) is not used for BIDIR */

        /* avg(l0-mv, l1-mv) candidate */
        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);
        /* coloc candidate */
        src0 = fref0->lowresPlane[0] + pelOffset;
        src1 = fref1->lowresPlane[0] + pelOffset;
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);
        bcost += lowresPenalty;
    }
    else /* P, also consider intra */
    {
        bcost += lowresPenalty;

        // listused == 0 marks the CU as intra
        if (fenc->intraCost[cuXY] < bcost)
        {
            bcost = fenc->intraCost[cuXY];
            listused = 0;
        }
    }

    /* do not include edge blocks in the frame cost estimates, they are not very accurate */
    // Exception: when the grid is at most 2 CUs wide or tall, every CU is counted
    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
    // AQ-weighted cost: bcost scaled by the 8.8 fixed-point inverse qscale factor, with rounding
    // NOTE(review): the qgSize == 8 branch null-checks invQscaleFactor but reads invQscaleFactor8x8 — confirm both are always allocated together
    int bcostAq;
    if (m_lookahead.m_param->rc.qgSize == 8)
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
    else
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;

    if (bFrameScoreCU)
    {
        if (slice < 0)
        {
            fenc->costEst[b - p0][p1 - b] += bcost;
            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
            if (!listused && !bBidir)
                fenc->intraMbs[b - p0]++;
        }
        else
        {
            m_slice[slice].costEst += bcost;
            m_slice[slice].costEstAq += bcostAq;
            if (!listused && !bBidir)
                m_slice[slice].intraMbs++;
        }
    }

    // Row totals include edge CUs; the per-CU entry packs the clamped cost with the list-used bits above it
    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
    fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}

2.2 计算传播损失（estimateCUPropagate）

函数的主要功能是根据前面获得的inter与intra cost来计算传播损失，传播损失计算的公式为
$propagateAmount = (propagateIn + intraCost \times invQscale \times fpsFactor) \times (1-\frac{interCost}{intraCost})$
这个公式表示的意思是：
（1）当前CU的cost，与propagateIn呈正相关（或者说别的CU赋予当前CU的重要程度）。如果别的CU认为当前CU很重要，则当前CU应该被认为是重要的，应该以较高质量编码

（2）当前CU的cost，与intraCost呈正相关。如果intraCost比较大，说明纹理比较复杂，应该以较高质量编码。qscale表示一个与qp相关的因子，可以理解是一个调控因子

（3）当前CU的cost，与fpsFactor（帧持续时长相对于平均持续时长的比值）呈正相关。如果帧的持续时间较长，从人眼视觉来说，当前帧比较重要

（4）当前CU的cost，与interCost和intraCost之间的关系有关，只有当interCost小于intraCost时，传播损失才大于0，并且只有传播损失大于0时才会被使用。如果interCost远小于intraCost，说明视频前后的图像很相似，使用Inter模式带来的损失很小，如果将当前CU以高质量编码，后续CU编码损失会很小。在这种情况下，传播损失会比较大，即当前CU的重要程度很高

PS：如果interCost大于intraCost，说明当前CU直接使用Intra模式效果更好，而cuTree是面向Inter模式的一种技术，这种情况下的传播损失设置为0

// Compute the per-CU propagate amount of frames[b] from the previously estimated inter and
// intra costs, and add it (saturating) to the propagateCost of the reference frames
// frames[p0] and frames[p1]. When referenced is non-zero and VBV plus lookahead are enabled,
// cuTreeFinish() is called for frames[b] afterwards.
void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
{
	// Destination buffers: propagateCost of the list-0 (p0) and list-1 (p1) reference frames
    uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
	/*
		b  indexes the current frame
		p0 indexes its list-0 reference frame
		p1 indexes its list-1 reference frame
	*/
    int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
    int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
    int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
    int listDist[2] = { b - p0, p1 - b };

    memset(m_scratch, 0, m_8x8Width * sizeof(int));

    uint16_t *propagateCost = frames[b]->propagateCost;

    x265_emms();
    double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);

    /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
    if (!referenced) // only the first row is cleared; the pointer is not advanced below in this case
        memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t));

    int32_t strideInCU = m_8x8Width;
    for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
    {
        int cuIndex = blocky * strideInCU;
		/*
			Compute the propagate amounts for one row of CUs into m_scratch. Inputs:
			(1) propagateCost: cost already propagated into the current frame
			(2) frames[b]->intraCost: intra cost of the current frame
			(3) frames[b]->lowresCosts[b - p0][p1 - b]: per-CU cost entries for this
			    (p0, b, p1) combination; the bits above LOWRES_COST_SHIFT hold the lists used
			(4) frames[b]->invQscaleFactor (or invQscaleFactor8x8 when qgSize == 8): inverse qscale factor
			(5) fpsFactor: ratio of the nominal frame duration to averageDuration
		*/
        if (m_param->rc.qgSize == 8)
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
        else // same row-level call, reading invQscaleFactor instead
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);

        if (referenced)
            propagateCost += m_8x8Width;

        for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++)
        {
            int32_t propagate_amount = m_scratch[blockx]; // amount this CU passes on to its references
            /* Don't propagate for an intra block. */
            if (propagate_amount > 0) // nothing to distribute otherwise
            {
                /* Access width-2 bitfield. */
                int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
                /* Follow the MVs to the previous frame(s). */
                for (uint16_t list = 0; list < 2; list++)
                {
                    if ((lists_used >> list) & 1)
                    {
#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) // s += x, saturating at 65535
                        int32_t listamount = propagate_amount;
                        /* Apply bipred weighting. */
                        if (lists_used == 3)
                            listamount = (listamount * bipredWeights[list] + 32) >> 6;

                        MV *mvs = frames[b]->lowresMvs[list][listDist[list]];

                        /* Early termination for simple case of mv0. */
                        if (!mvs[cuIndex].word) // zero MV: the whole amount goes to the co-located CU
                        {
                            CLIP_ADD(refCosts[list][cuIndex], listamount);
                            continue;
                        }
						/*
							With a non-zero MV the referenced area generally straddles up to
							four CUs of the reference frame, so the amount is split among them:

							+ ---- + ---- +
							|	0  |   1  |
							+ ---- + ---- +
							|   2  |   3  |
							+ ---- + ---- +

							The shift by 5 and mask with 31 below treat 32 MV units as one CU:
							(1) presumably MVs are in quarter-pel units (>> 2 gives whole pixels)
							(2) and a CU is 8 pixels wide (>> 3 more gives the CU coordinate) — TODO confirm
						*/
                        int32_t x = mvs[cuIndex].x;
                        int32_t y = mvs[cuIndex].y;
                        int32_t cux = (x >> 5) + blockx;		// CU column of block 0
                        int32_t cuy = (y >> 5) + blocky;		// CU row of block 0
                        int32_t idx0 = cux + cuy * strideInCU;	// index of block 0
                        int32_t idx1 = idx0 + 1;				// index of block 1
                        int32_t idx2 = idx0 + strideInCU;		// index of block 2
                        int32_t idx3 = idx0 + strideInCU + 1;	// index of block 3
                        x &= 31;
                        y &= 31;
                        int32_t idx0weight = (32 - y) * (32 - x);	// weight of block 0
                        int32_t idx1weight = (32 - y) * x;			// weight of block 1
                        int32_t idx2weight = y * (32 - x);			// weight of block 2
                        int32_t idx3weight = y * x;					// weight of block 3

                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
                         * be counted. */
						/*
							Check whether the four target blocks lie inside the CU grid:
							(1) all inside: add the weighted share to each of them
							(2) some outside: add only to the blocks that are inside
						*/
                        if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0) // all four blocks inside
                        {
                            CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                        else /* Check offsets individually */
                        {
							// Some blocks may fall outside the grid; test each one
                            if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0)					// block 0 inside
                                CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0)			// block 1 inside
                                CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0)			// block 2 inside
                                CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0)	// block 3 inside
                                CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);	
                        }
                    }
                }
            }
        }
    }

    // ref0Distance passed on is b - p0 when the frame has no list-1 reference (b == p1), else 0
    if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
}

2.3 计算qp调整量（cuTreeFinish）

函数的主要功能是基于前面已经获取的传播损失，来计算当前CU的qp调整量

/* Turn the propagate costs accumulated for `frame` into per-block cuTree QP offsets.
 *
 *   frame            lowres frame whose qpCuTreeOffset entries are written
 *   averageDuration  used, together with the nominal frame duration, to build fpsFactor
 *   ref0Distance     when non-zero, selects the weightedCostDelta entry added to the ratio
 *
 * Blocks whose scaled intra cost is zero keep their previous qpCuTreeOffset value. */
void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
{
    /* hevcAq has its own routine; nothing else to do here in that mode. */
    if (m_param->rc.hevcAq)
    {
        computeCUTreeQpOffset(frame, averageDuration, ref0Distance);
        return;
    }

    /* 8.8 fixed-point ratio of average duration to nominal frame duration. */
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);

    double weightdelta = 0.0;
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    if (m_param->rc.qgSize != 8)
    {
        /* One offset per lowres CU. */
        for (int cu = 0; cu < m_cuCount; cu++)
        {
            int scaledIntra = (frame->intraCost[cu] * frame->invQscaleFactor[cu] + 128) >> 8;
            if (!scaledIntra)
                continue;

            int scaledPropagate = (frame->propagateCost[cu] * fpsFactor + 128) >> 8;
            /* The more cost propagates into this CU relative to its intra cost, the larger
             * the ratio and the further the offset drops below the AQ offset. */
            double log2_ratio = X265_LOG2(scaledIntra + scaledPropagate) - X265_LOG2(scaledIntra) + weightdelta;
            frame->qpCuTreeOffset[cu] = frame->qpAqOffset[cu] - m_cuTreeStrength * log2_ratio;
        }
        return;
    }

    /* qgSize == 8: every lowres CU maps onto a 2x2 group of entries in the full-res offset arrays. */
    for (int cuY = 0; cuY < m_8x8Height; cuY++)
    {
        for (int cuX = 0; cuX < m_8x8Width; cuX++)
        {
            const int cuXY = cuX + cuY * m_8x8Width;
            int scaledIntra = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
            if (!scaledIntra)
                continue;

            int scaledPropagate = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(scaledIntra + scaledPropagate) - X265_LOG2(scaledIntra) + weightdelta;

            /* Top-left entry of the 2x2 group; the second row is maxBlocksInRowFullRes further on. */
            const int topLeft = cuX * 2 + cuY * m_8x8Width * 4;
            frame->qpCuTreeOffset[topLeft] = frame->qpAqOffset[topLeft] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[topLeft + 1] = frame->qpAqOffset[topLeft + 1] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[topLeft + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[topLeft + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[topLeft + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[topLeft + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
        }
    }
}

3.qpOffset的使用

通过前面的AQ和cuTree获得了qpOffset等信息，在实际编码过程中会被使用到，粗略来说，可能有几种用法：
（1）直接使用qp进行CU级调整（例如calculateQpforCuSize）
（2）调整行级平均qp（与bOptCUDeltaQP相关）
（3）调整lowres的帧级cost，影响码控（与VBV相关，例如getEstimatedPictureCost）

在这几种使用中，简单记录一下calculateQpforCuSize()

int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
    FrameData& curEncData = *m_frame->m_encData;
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;

    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
    {
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
            qp += distortionData->offset[ctu.m_cuAddr];
    }
	// analysisLoadReuseLevel默认为0
    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
    {
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
        if (ctu.m_slice->m_sliceType == I_SLICE)
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
        else
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
    }
	// 是否使用hevcAq
    if (m_param->rc.hevcAq)
    {
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double dQpOffset = 0;
        if (bCuTreeOffset)
        {
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
        }
        else
        {
            dQpOffset = aqQPOffset(ctu, cuGeom);
            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
        qp += dQpOffset;
    }
    else
    {
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
        if (qpoffs)
        {
            uint32_t width = m_frame->m_fencPic->m_picWidth;
            uint32_t height = m_frame->m_fencPic->m_picHeight;
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
            double dQpOffset = 0;
            uint32_t cnt = 0;
            // 遍历16x16小块，从中取出原先计算好的qpOffset
            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
            {
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
                {
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                    dQpOffset += qpoffs[idx];
                    cnt++;
                }
            }
            dQpOffset /= cnt;
            qp += dQpOffset; // 进行qp的调整
			// complexCheck默认为 -1
            if (complexCheck)	
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
    }
	// 对qp进行clip，防止溢出
    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}

4.AQ技术和cuTree技术之间的关联

AQ技术和cuTree技术都是CU级别的码率控制技术，其中AQ技术主要思想是基于帧内图像的空域相关性来调整qp，cuTree技术主要思想是基于帧间图像的时域相关性来调整qp。在实际编码过程中，通常是先计算AQ再计算cuTree，两者之间的影响关系如下所示，位于encoder\encoder.cpp中

/*
	Excerpt from Encoder::configure() in encoder.cpp
	(1) AQ disabled but cuTree enabled: aqMode is forced to X265_AQ_VARIANCE and aqStrength to 0.0
	(2) aqStrength is 0 and cuTree disabled: aqMode becomes X265_AQ_NONE and hevcAq is cleared
	(3) Both AQ and cuTree disabled: aqStrength is reset to 0
*/
if (p->rc.aqMode == 0 && p->rc.cuTree)
{
    p->rc.aqMode = X265_AQ_VARIANCE;
    p->rc.aqStrength = 0.0;
}

if (p->rc.aqStrength == 0 && p->rc.cuTree == 0)
{
    p->rc.aqMode = X265_AQ_NONE;
    p->rc.hevcAq = 0;
}

if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0)
    p->rc.aqStrength = 0;

5.hevcAq模式

这个模式是后续提出来的，单独面向265标准的一种qp计算模式，默认不会启用。这种模式的主要思想是考虑一帧之内的纹理复杂度（用方差描述），利用单个块和整帧平均值来计算qpOffset。在这个模式下，复杂度的计算使用xPreanalyze()实现，qpOffset的计算使用computeCUTreeQpOffset()实现

5.1 复杂度的计算（xPreanalyze）

xPreanalyze()函数被calcAdaptiveQuantFrame()调用，简单来说就是按照不同的粒度对当前帧进行分析，分析的依据是方差，根据方差来对CU进行qp的调整

/* hevcAq pre-analysis of one frame.
 *
 * For every AQ layer enabled in aqLayerDepth, the luma plane is tiled into
 * aqPartWidth x aqPartHeight parts. Each part is split into four quadrants, and
 * 1 + (smallest quadrant variance) is stored as the part's activity; the layer's
 * average activity is stored as well. xPreanalyzeQp() then converts activities to QP
 * offsets, and the offsets of the layer selected by minAQDepth are written to
 * invQscaleFactor, followed by an acEnergyCu() call per part. */
void LookaheadTLD::xPreanalyze(Frame* curFrame)
{
    const uint32_t width = curFrame->m_fencPic->m_picWidth;
    const uint32_t height = curFrame->m_fencPic->m_picHeight;
    const intptr_t stride = curFrame->m_fencPic->m_stride;

    /* Table indices depend only on the CTU size and quantization group size, not on the layer. */
    const int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
    const int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];

    /* Step 1: activity (texture complexity) per part, for each enabled layer. */
    for (uint32_t layer = 0; layer < 4; layer++)
    {
        if (!aqLayerDepth[ctuSizeIdx][aqDepth][layer])
            continue;

        PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[layer];
        const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
        const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
        double* activityOut = pQPLayer->dActivity;
        const pixel* rowOrigin = curFrame->m_fencPic->m_picOrg[0]; /* plane 0 */
        double dSumAct = 0.0;

        for (uint32_t y = 0; y < height; y += aqPartHeight)
        {
            /* Parts on the bottom/right picture edge may be smaller than the nominal size. */
            const uint32_t partHeight = X265_MIN(aqPartHeight, height - y);
            const uint32_t halfHeight = partHeight >> 1;

            for (uint32_t x = 0; x < width; x += aqPartWidth, activityOut++)
            {
                const uint32_t partWidth = X265_MIN(aqPartWidth, width - x);
                const uint32_t halfWidth = partWidth >> 1;

                /* Quadrant layout of the accumulators:
                 *   +---+---+
                 *   | 0 | 1 |
                 *   +---+---+
                 *   | 2 | 3 |
                 *   +---+---+ */
                uint64_t sum[4] = { 0, 0, 0, 0 };
                uint64_t sumSq[4] = { 0, 0, 0, 0 };

                const pixel* row = &rowOrigin[x];
                for (uint32_t by = 0; by < partHeight; by++, row += stride)
                {
                    const int rowQuad = (by < halfHeight) ? 0 : 2;
                    for (uint32_t bx = 0; bx < partWidth; bx++)
                    {
                        const int quad = rowQuad + ((bx < halfWidth) ? 0 : 1);
                        sum[quad] += row[bx];
                        sumSq[quad] += row[bx] * row[bx];
                    }
                }

                assert((partWidth & 1) == 0);
                assert((partHeight & 1) == 0);

                /* Pixel count of a single quadrant. */
                const uint32_t numPixInQuad = halfWidth * halfHeight;

                /* Smallest variance among the four quadrants (0 for an empty quadrant). */
                double dMinVar = 0.0;
                if (numPixInQuad != 0)
                {
                    dMinVar = MAX_DOUBLE;
                    for (int quad = 0; quad < 4; quad++)
                    {
                        const double dAverage = double(sum[quad]) / numPixInQuad;
                        const double dVariance = double(sumSq[quad]) / numPixInQuad - dAverage * dAverage;
                        dMinVar = X265_MIN(dMinVar, dVariance);
                    }
                }

                const double dActivity = 1.0 + dMinVar;
                *activityOut = dActivity;
                dSumAct += dActivity;
            }
            rowOrigin += stride * partHeight;
        }

        /* Mean activity over all parts of this layer. */
        pQPLayer->dAvgActivity = dSumAct / (pQPLayer->numAQPartInWidth * pQPLayer->numAQPartInHeight);
    }

    /* Step 2: convert activities into QP offsets for every enabled layer. */
    xPreanalyzeQp(curFrame);

    /* Layer whose offsets feed the lowres buffers. */
    int minAQDepth = curFrame->m_lowres.pAQLayer->minAQDepth;
    PicQPAdaptationLayer* pMinLayer = &curFrame->m_lowres.pAQLayer[minAQDepth];
    const uint32_t minPartWidth = pMinLayer->aqPartWidth;
    const uint32_t minPartHeight = pMinLayer->aqPartHeight;
    double* pcQP = pMinLayer->dQpOffset;

    // Use new qp offset values for qpAqOffset, qpCuTreeOffset and invQscaleFactor buffer
    /* Only invQscaleFactor is written directly here; any other buffer would have to be
     * filled inside acEnergyCu(), which is not visible from this function. */
    int blockXY = 0;
    for (uint32_t y = 0; y < height; y += minPartHeight)
    {
        for (uint32_t x = 0; x < width; x += minPartWidth, pcQP++, blockXY++)
        {
            curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(*pcQP);
            acEnergyCu(curFrame, x, y, curFrame->m_param->internalCsp, curFrame->m_param->rc.qgSize);
        }
    }
}

xPreanalyzeQp()的定义如下

/* Convert the per-part activities of every enabled AQ layer into QP offsets.
 *
 * For each part:
 *   dNormAct  = (s * act + avg) / (act + s * avg),   s = 2^(qpAdaptationRange / 6)
 *   dQpOffset = 6 * log2(dNormAct)
 * so a part whose activity is above the layer average gets a positive offset and one
 * below the average a negative offset, bounded by +/- qpAdaptationRange.
 * The same value is stored in both dQpOffset and dCuTreeOffset. */
void LookaheadTLD::xPreanalyzeQp(Frame* curFrame)
{
    const uint32_t width = curFrame->m_fencPic->m_picWidth;
    const uint32_t height = curFrame->m_fencPic->m_picHeight;

    /* None of these depend on the layer or the part being processed. */
    const int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
    const int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
    const double dMaxQScale = pow(2.0, curFrame->m_param->rc.qpAdaptationRange / 6.0);

    for (uint32_t layer = 0; layer < 4; layer++)
    {
        if (!aqLayerDepth[ctuSizeIdx][aqDepth][layer])
            continue;

        PicQPAdaptationLayer* pcAQLayer = &curFrame->m_lowres.pAQLayer[layer];
        const uint32_t aqPartWidth = pcAQLayer->aqPartWidth;
        const uint32_t aqPartHeight = pcAQLayer->aqPartHeight;
        const double* activity = pcAQLayer->dActivity;
        double* qpOffsetOut = pcAQLayer->dQpOffset;
        double* cuTreeOffsetOut = pcAQLayer->dCuTreeOffset;

        /* Average activity of this layer, the reference every part is compared with. */
        const double dAvgAct = pcAQLayer->dAvgActivity;

        /* Parts are stored in raster order; `part` walks all three arrays in step. */
        size_t part = 0;
        for (uint32_t y = 0; y < height; y += aqPartHeight)
        {
            for (uint32_t x = 0; x < width; x += aqPartWidth, part++)
            {
                const double dCUAct = activity[part];
                const double dNormAct = (dMaxQScale*dCUAct + dAvgAct) / (dCUAct + dMaxQScale*dAvgAct);
                const double dQpOffset = (X265_LOG2(dNormAct) / X265_LOG2(2.0)) * 6.0;

                qpOffsetOut[part] = dQpOffset;
                cuTreeOffsetOut[part] = dQpOffset;
            }
        }
    }
}

5.2 qpOffset的计算（computeCUTreeQpOffset）

computeCUTreeQpOffset()被cuTreeFinish()调用，实现了hevcAq模式下的qpOffset的计算，其定义如下

/**
 * Fills dCuTreeOffset for every enabled AQ layer of a frame.
 *
 * For each AQ partition, the lookahead blocks it covers (stepped by loopIncr and clipped to the
 * full-resolution frame size) each contribute
 *     log2(intraCost + propagateCost) - log2(intraCost) + weightdelta.
 * The mean of those contributions, scaled by m_cuTreeStrength, is subtracted from the partition's
 * dQpOffset and the result is stored in dCuTreeOffset.
 *
 * @param frame            lowres frame; its pAQLayer[d].dQpOffset is read and
 *                         pAQLayer[d].dCuTreeOffset is written
 * @param averageDuration  clipped and divided by the clipped nominal frame duration
 *                         (fpsDenom / fpsNum) to form a x256 fixed-point scale for propagateCost
 * @param ref0Distance     when non-zero, weightedCostDelta[ref0Distance - 1] is consulted to
 *                         derive weightdelta
 */
void Lookahead::computeCUTreeQpOffset(Lowres *frame, double averageDuration, int ref0Distance)
{
    // Duration ratio in x256 fixed point; undone below by the "+ 128) >> 8" rounding shift.
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

    // Extra term added to every block's log ratio when the weighted cost delta is positive.
    double weightdelta = 0.0;
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    uint32_t widthFullRes = frame->widthFullRes;
    uint32_t heightFullRes = frame->heightFullRes;

    if (m_param->rc.qgSize == 8)
    {
        // ... 
    }
    else
    {
        // These depend only on encoder parameters / the frame, not on the layer index.
        const int ctuSizeIdx = 6 - g_log2Size[m_param->maxCUSize];
        const int aqDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
        const uint32_t maxCols = frame->maxBlocksInRow;

        // Visit each AQ layer that is enabled for this CTU size / AQ depth combination.
        for (uint32_t d = 0; d < 4; d++)
        {
            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
                continue;

            PicQPAdaptationLayer* pQPLayer = &frame->pAQLayer[d];
            const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
            const uint32_t aqPartHeight = pQPLayer->aqPartHeight;

            const uint32_t numAQPartInWidth = pQPLayer->numAQPartInWidth;
            const uint32_t numAQPartInHeight = pQPLayer->numAQPartInHeight;

            // Both arrays are walked in raster order, one entry per AQ partition.
            double* pcQP = pQPLayer->dQpOffset;
            double* pcCuTree = pQPLayer->dCuTreeOffset;

            for (uint32_t y = 0; y < numAQPartInHeight; y++)
            {
                for (uint32_t x = 0; x < numAQPartInWidth; x++, pcQP++, pcCuTree++)
                {
                    const uint32_t block_x = x * aqPartWidth;
                    const uint32_t block_y = y * aqPartHeight;

                    uint32_t blockXY = 0;   // number of lookahead blocks covered by this partition
                    double log2_ratio = 0;  // sum of the per-block log ratios

                    for (uint32_t block_yy = block_y; block_yy < block_y + aqPartHeight && block_yy < heightFullRes; block_yy += loopIncr)
                    {
                        for (uint32_t block_xx = block_x; block_xx < block_x + aqPartWidth && block_xx < widthFullRes; block_xx += loopIncr)
                        {
                            const uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);

                            const int intraCost = (frame->intraCost[idx] * frame->invQscaleFactor[idx] + 128) >> 8;

                            // The block always counts towards the average, even when it adds nothing.
                            blockXY++;

                            // log2 of a non-positive cost is -inf/NaN and would poison the whole
                            // partition's offset; such a block contributes a zero ratio instead.
                            if (intraCost <= 0)
                                continue;

                            const int propagateCost = (frame->propagateCost[idx] * fpsFactor + 128) >> 8;

                            log2_ratio += (X265_LOG2(intraCost + propagateCost) - X265_LOG2(intraCost) + weightdelta);
                        }
                    }

                    // Mean log ratio scaled by the cuTree strength; if no block was covered the
                    // AQ offset is passed through unchanged rather than dividing by zero.
                    const double qp_offset = blockXY ? (m_cuTreeStrength * log2_ratio) / blockXY : 0.0;

                    *pcCuTree = *pcQP - qp_offset;
                }
            }
        }
    }
}