


1. h265标准帧内预测概述

1.1 帧内预测编码结构

在h265标准中,CU的尺寸最大为64x64,最小为8x8,以四叉树(QuadTree,QT)向下逐层划分,同时引入了预测单元(Prediction Unit,PU)这一概念,用于预测编码。PU由CU划分成为多个预测区域而来,对于帧内预测而言,PU的尺寸可以分为两类:

// 2Nx2N
|         |
+         +
|         |

// NxN
|    |    |
|    |    |

1.2 帧内预测模式

1.2.1 角度模式(模式2~34)


1.2.2 Planar模式(模式0)


1.2.3 DC模式(模式1)


1.2 亮度模式的编码

相比于H.264标准,H.265中的帧内预测模式增加到了35种。为了降低亮度预测模式的编码比特开销,引入了最可能模式(Most Probable Mode,MPM)的概念,即利用空间域信息(相邻已编码块的预测模式)来预测当前块的模式。这样做的依据是,相邻块之间往往具有类似的纹理特征,其预测模式很有可能相同或者接近。具体来说,mpm包含3个候选模式,由左侧参考块a和上方参考块b的预测模式推导得到,如下所示,其中c为当前待编码块,a和b为已编码块:

|     |  b  |
|  a  |  c  |

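(1)如果a和b模式相同(modeA == modeB),分两种情况
 (i)a和b都为Planar或DC模式,则mpm = { Planar, DC, 26 }
 (ii)a和b都为角度模式,则mpm = { ModeA, ModeA - 1, ModeA + 1};这里需要注意角度模式首尾循环相邻:模式2与模式3和模式33相邻,模式34与模式33和模式3相邻
(2)如果a和b模式不同,则mpm = { modeA, modeB, X },其中X分几种情况决定
 (i)如果modeA和modeB都不是Planar,则X = Planar
 (ii)否则,如果modeA和modeB都不是DC,则X = DC
 (iii)否则,X = 26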

(3)如果当前块的模式modeC不在mpm中,则需要在剩余的32种模式中对其进行编码
 (i)将mpm中的3个候选模式按升序排序
 (ii)从大到小遍历mpm中的候选模式,分别与modeC进行比较,如果modeC >= mpm[i],则将modeC自减1,随后对modeC最终的值进行编码

1.3 色度模式编码


(1)如果亮度预测模式modeLuma不是前4种中的一种,则直接对模式编号进行编码,此时色度模式参考队列为 modeChroma = { Planar, 26, 10, DC, modeLuma },其中Planar模式对应的是编码模式0号,角度模式26对应的是编码模式1号,角度模式10对应的是编码模式2号,依此类推



2. 帧内预测入口函数(compressIntraCU)

(2)检查当前CU是否有确定的intra模式和depth(前两步具有early termination的思想)


// Recursive intra analysis of one CU. Evaluates the CU at the current depth
// through checkIntra(), optionally recurses into its four sub-CUs, and returns
// the rdCost of the best mode recorded in m_modeDepth[depth].bestMode.
// NOTE(review): this listing has lost its braces and some `else` lines;
// indentation is the only remaining hint of the block structure.
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;
		1. 检查当前CU是否有可能继续划分
    // mightSplit: the CU is not flagged LEAF; mightNotSplit: a split is not mandatory
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

		2. 检查当前CU是否有确定的dir和depth
		(1)如果CU已有对应的intra dir,并且分析类型不为HEVC_INFO,表示已确定了intra dir
		PS: intraRefine默认为0
    // bAlreadyDecided: a luma intra direction other than ALL_IDX is already stored for this CU
    // bDecidedDepth:   the stored CU depth equals the current depth
    // Both are forced to false when intraRefine == 4.
    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;
    if (m_param->intraRefine && m_param->intraRefine != 4)
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
            bAlreadyDecided = false;

    if (bAlreadyDecided)
        if (bDecidedDepth && mightNotSplit)
            Mode& mode = md.pred[0];
            md.bestMode = &mode;
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            // The stored directions are reused unless intraRefine is 3, or intraRefine is 2
            // and the stored luma direction is greater than DC_IDX
            bool reuseModes = !((m_param->intraRefine == 3) ||
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
            if (reuseModes)
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
			// Try lossless coding (the body of this branch is not shown in the listing)
            if (m_bTryLossless)

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        // 3. Entry point of the intra prediction search (2Nx2N partition)
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
        // 4. Check the best mode
        checkBestMode(md.pred[PRED_INTRA], depth);
		// 8x8 CU (log2CUSize == 3) with a minimum TU size below 8: also analyse the NxN partition
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN); // this analysis uses partSize SIZE_NxN
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);

        if (m_bTryLossless)

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);

	// Stop splitting once the depth chosen by a previous analysis has been reached
	// (a form of early termination), unless `split` forces a further split
    // stop recursion if we reach the depth of previous analysis decision
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;
	// 5. Split into sub-CUs and analyse them when splitting is still possible
    if (mightSplit)
        Mode* splitPred = &md.pred[PRED_SPLIT];
        CUData* splitCU = &splitPred->cu;
		// Initialise the CU that collects the split result
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1; // sub-CUs are one depth level deeper
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth); // mark the contexts of nextDepth as invalid
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;
        uint64_t curCost = 0;
        int skipSplitCheck = 0;
		// Iterate over the four sub-CUs
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
			// Geometry of the child CU
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
				// Copy the sub-CU's part of the depth-0 source YUV into nd.fencYuv
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);

				// Recompute the QP for the sub-CU when delta-QP is enabled at this depth
                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                // With bEnableSplitRdSkip the sub-CU costs are accumulated; once the sum exceeds
                // the unsplit best cost, the final split check below is skipped
                if (m_param->bEnableSplitRdSkip)
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
                        skipSplitCheck = 1;
                else // analyse the sub-CU
                    compressIntraCU(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
                // NOTE(review): the lines below presumably belong to the `else` branch for a
                // sub-CU that is not PRESENT; the `else` line is missing from this listing
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);

                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
                if (bAlreadyDecided)
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
        if (!skipSplitCheck)
            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);

            checkDQPForSplitPred(*splitPred, cuGeom);
            checkBestMode(*splitPred, depth);

    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
        int cuIdx = (cuGeom.childOffset - 1) / 3;
        cacheCost[cuIdx] = md.bestMode->rdCost;

    // Record the largest TU depth used by the best mode as a reference for this CU position
    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
        int8_t maxTUDepth = -1;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
	// 6. Store the best data
    /* Copy best data to encData CTU and recon */
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);

    return md.bestMode->rdCost;

2.1 帧内预测入口函数(checkIntra)

(1)一些信息的初始化(partSize、intra mode和costs)


// Evaluates intra coding of the CU held in intraMode: accumulates the luma and
// chroma distortion, entropy-codes the CU to count its bits, computes the
// optional psy/SSIM energy terms and finishes with checkDQP().
// NOTE(review): partSize is not referenced in the visible part of this listing;
// the initialisation statements of step 1 appear to be missing, as do the
// braces and `else` lines.
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
    CUData& cu = intraMode.cu;
	// 1. Initialisation (the statements are not shown in this listing)

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

	// 2. Intra prediction distortion of the luma component
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
    if (m_csp != X265_CSP_I400) // not 4:0:0, so chroma components exist
    	// 3. Intra prediction distortion of the chroma components
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
		// Total distortion = luma distortion + chroma distortion
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
        // NOTE(review): the next statement is presumably the `else` branch (luma only);
        // the `else` line is missing from this listing
        intraMode.distortion += intraMode.lumaDistortion;
    // 4. Entropy-code the CU information to measure its bit cost
	// Index 0 holds the distortion of the CU at the top of the current tree
    cu.m_distortion[0] = intraMode.distortion;
	// pps->bTransquantBypassEnabled = m_param->bCULossless || m_param->bLossless;
	// bTransquantBypassEnabled follows the lossless parameters (author notes it is off by default)
    if (m_slice->m_pps->bTransquantBypassEnabled)

    int skipFlagBits = 0;
    if (!m_slice->isIntra()) // non-intra slice: some of its CUs may still be intra coded
        m_entropyCoder.codeSkipFlag(cu, 0); // code the skip flag
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); // bits written so far, i.e. the skip flag
        m_entropyCoder.codePredMode(cu.m_predMode[0]);	// code the prediction mode
	// Code the partition size
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); 
    // Code the prediction information
	m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
	// Code the residual coefficients
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
	// Total number of bits written
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
	// Coefficient bits = total bits - mode bits - skip flag bits
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
    const Yuv* fencYuv = intraMode.fencYuv;
    // 5. Perceptual cost terms
	// Psycho-visual RD optimisation: psy energy between source and reconstruction (plane 0)
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
    else if(m_rdCost.m_ssimRd) // SSIM-based RD optimisation
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
	// SSE between the source and the intra prediction (plane 0): the residual energy
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
	// Update the RD cost (the statement is not shown in this listing)
	// 6. Check DQP
    checkDQP(intraMode, cuGeom);

2.1.1 计算亮度分量的帧内预测损失(estIntraPredQT)


// Chooses the luma intra direction for every PU of the CU in intraMode and
// returns the accumulated luma distortion.
// For a PU whose direction is not preset (m_lumaIntraDir == ALL_IDX):
//   a) a SA8D-based cost is computed for DC, planar and angular modes 2..34,
//   b) up to maxCandCount candidates are shortlisted,
//   c) the shortlist is evaluated by RDO without TU splits.
// The winning mode is then re-measured with TU splits allowed.
// NOTE(review): braces and `else` lines are missing from this listing.
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    uint32_t depth        = cuGeom.depth;
	// initTuDepth is 0 for SIZE_2Nx2N (a single PU covering the CU, numPU == 1) and 1 otherwise (numPU == 4)
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
    uint32_t numPU        = 1 << (2 * initTuDepth);
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
    uint32_t tuSize       = 1 << log2TrSize;
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;
    uint32_t sizeIdx      = log2TrSize - 2;
    uint32_t absPartIdx   = 0;
    sse_t totalDistortion = 0;
	// Transform-skip is evaluated only when it is enabled in the PPS, the CU does not use
	// transquant bypass, and the partition is not 2Nx2N
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

	// Process the CU one PU at a time
    // loop over partitions
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
        uint32_t bmode = 0;
		// Use the intra direction of this PU if one has already been assigned
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
        {	// no direction assigned: search for the best intra mode
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
			// rdLevel selects the RDO level (author notes the default is 3 - TODO confirm);
			// more candidates are kept at higher rdLevel and at deeper depths
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);

                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                // Reference sample smoothing
                IntraNeighbors intraNeighbors;
				// 1. Collect the availability of the neighbouring units
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
				// 2. Fill the reference samples and smooth them
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

                // determine set of modes to be tested (using prediction signal only)
				// Source pixels of this PU and the stride used for the cost computation
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                uint32_t stride = predYuv->m_size;

                int scaleTuSize = tuSize;
                int scaleStride = stride;
                int costShift = 0;
				// Load the luma intra direction from previously coded information
				// (the corresponding statements are not shown in this listing)

                /* there are three cost tiers for intra modes:
                *  pred[0]          - mode probable, least cost
                *  pred[1], pred[2] - less probable, slightly more cost
                *  non-mpm modes    - all cost the same (rbits) */
					3. 获取mpm(most probable mode)
					例如 67,108,867,即 ‭0100 0000 0000 0000 0000 0000 0011‬
					其中,低2位分别表示DC(第1位)和Planar(第0位),最高的 0100 表示第26位,即角度编号为26的模式位于mpm中
                uint64_t mpms;
                uint32_t mpmModes[3];
				// Fills mpmModes and the mpms bit mask; the returned value is used below as the
				// bit cost of a mode that is not among the MPMs
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
				// SA8D cost function for this block size
                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
                uint64_t modeCosts[35];
					4. 选择最佳的帧内预测模式
                // DC
				// Run the DC prediction
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
				// MPM bit cost when DC is among the MPMs, rbits otherwise
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
				// Combine distortion and bits into the cost of this mode
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);

                // PLANAR
				// intraNeighbourBuf[0] holds the unfiltered reference samples
                pixel* planar = intraNeighbourBuf[0];
                if (tuSize >= 8 && tuSize <= 32)
                    planar = intraNeighbourBuf[1];	// intraNeighbourBuf[1] holds the filtered reference samples
				// Run the planar prediction
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                // angular predictions
					初始化函数为ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
                if (primitives.cu[sizeIdx].intra_pred_allangs)
						初始化函数为ALL_LUMA_CU_S(transpose, transpose, avx2);
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
					// Predict all angular modes in one call
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                    for (int mode = 2; mode < 35; mode++)
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
						// mode < 18: mostly horizontal directions, compared against the transposed source
                        if (mode < 18)
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        else // mode >= 18: mostly vertical directions
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        // Cost of this mode
						modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                {	// no all-angles primitive: predict each angular mode separately
                    for (int mode = 2; mode < 35; mode++)
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);

                /* Find the top maxCandCount candidate modes with cost within 25% of best
                * or among the most probable modes. maxCandCount is derived from the
                * rdLevel and depth. In general we want to try more modes at slower RD
                * levels and at higher depths */

					e.g. CUSize=32, rdLevel=3, maxCandCount=5
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;

                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25 * bcost (best cost plus 25%)
                for (int mode = 0; mode < 35; mode++)
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);

            /* measure best candidates using simple RDO (no TU splits) */
			// Run RDO on the shortlisted candidates to pick the best mode
            bcost = MAX_INT64;
            for (int i = 0; i < maxCandCount; i++)
                // Unused candidate slot (the statement taken in this case is not shown in the listing)
                if (candCostList[i] == MAX_INT64)

                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost icosts;
				// Transform-skip path when it is being checked
                if (checkTransformSkip)
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
                else // regular path: RD evaluation with TU splits disabled (false)
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

        /* remeasure best mode, allowing TU splits */
		// Re-evaluate the best mode, this time allowing the TU to split further
		// Store the best mode in all sub-parts of this PU
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);

        Cost icosts;
        if (checkTransformSkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
        else // TU splits enabled (true)
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
        totalDistortion += icosts.distortion;
		// Store the coefficients and the reconstruction
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
		// Copy the reconstruction into the recon picture so later PUs can predict from it
        // set reconstruction for next intra prediction blocks
        if (puIdx != numPU - 1)
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
             * that the contexts should be tracked through each PU */
            PicYuv*  reconPic = m_frame->m_reconPic;
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            uint32_t dststride = reconPic->m_stride;
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
            uint32_t srcstride = reconYuv->m_size;
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);

    // With four PUs, OR the depth-1 luma CBFs of the four quadrants into the CU-level CBF
    if (numPU > 1)
        uint32_t combCbfY = 0;
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);

        cu.m_cbf[0][0] |= combCbfY;

    // TODO: remove this

    return totalDistortion;
}

初始化相邻块(initIntraNeighbors)




// block
| a0 | a1 | b0 | b1 |
| a2 | a3 | b2 | b3 |

// zscan number
| 0 | 1 | 4 | 5 |
| 2 | 3 | 6 | 7 |




// Fills *intraNeighbors for the TU at absPartIdx (TU depth tuDepth): one
// availability flag per neighbouring unit, plus the unit counts, unit
// dimensions and log2 TU size.
// Flag layout, as established by the offsets passed below: index leftUnits is
// the above-left unit, higher indices run rightwards along the above and
// above-right units, lower indices run downwards along the left and
// below-left units.
// NOTE(review): braces and the `else` line are missing from this listing.
void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    int log2UnitWidth = LOG2_UNIT_SIZE;
    int log2UnitHeight = LOG2_UNIT_SIZE;

	// Chroma: scale the TU size and unit dimensions by the chroma shifts
    if (!isLuma)
        log2TrSize -= cu.m_hChromaShift;
        log2UnitWidth -= cu.m_hChromaShift;
        log2UnitHeight -= cu.m_vChromaShift;

    int numIntraNeighbor; // number of available neighbouring units
	// bNeighborFlags records, per unit, whether that neighbour is available
    bool* bNeighborFlags = intraNeighbors->bNeighborFlags;

    uint32_t tuSize = 1 << log2TrSize;
    int  tuWidthInUnits = tuSize >> log2UnitWidth;
    int  tuHeightInUnits = tuSize >> log2UnitHeight;
		aboveUnits = 2 x tuWidthInUnits
		leftUnits = 2 x tuHeightInUnits

		(1)正上方 + 右上方 = 8 + 8 = 16
		(2)左侧 + 左下角 = 8 + 8 = 16
    int  aboveUnits = tuWidthInUnits << 1;
    int  leftUnits = tuHeightInUnits << 1;
	// top-left unit of the TU (z-scan index within the CTU)
    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
	// top-right unit of the TU
	// z-scan -> raster, step right by (tuWidthInUnits - 1), then raster -> z-scan
    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + tuWidthInUnits - 1];
	// bottom-left unit of the TU
    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) << LOG2_RASTER_SIZE)];

		bConstrainedIntraPred表示对intra pred进行的限制
		(1)bConstrainedIntraPred = 1,表示只有相邻块也使用intra prediction时,才会被用于参考
		(2)bConstrainedIntraPred = 0,不添加限制
		如果bConstrainedIntraPred = 1,对应下面的函数中使用isAboveAvailable<false>,否则对应isAboveAvailable<true>
    // NOTE(review): per the condition below, the <false> variants are used for intra slices or
    // when constrained intra prediction is off; the <true> variants are used otherwise
    if (cu.m_slice->isIntra() || !cu.m_slice->m_pps->bConstrainedIntraPred)
		// Availability of the above-left unit
        bNeighborFlags[leftUnits] = isAboveLeftAvailable<false>(cu, partIdxLT);
        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);


			leftUnits(16) + 1 = 左下块数量(8) + 左侧块数量(8) + 左上角块(1) = 17
        numIntraNeighbor += isAboveAvailable<false>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
        numIntraNeighbor += isAboveRightAvailable<false>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
        numIntraNeighbor += isLeftAvailable<false>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
        numIntraNeighbor += isBelowLeftAvailable<false>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
    {	// constrained case: a neighbour must itself be intra coded to serve as a reference
        bNeighborFlags[leftUnits] = isAboveLeftAvailable<true>(cu, partIdxLT);
        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
        numIntraNeighbor += isAboveAvailable<true>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
        numIntraNeighbor += isAboveRightAvailable<true>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
        numIntraNeighbor += isLeftAvailable<true>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
        numIntraNeighbor += isBelowLeftAvailable<true>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);

    intraNeighbors->numIntraNeighbor = numIntraNeighbor;		// total number of available neighbouring units
    intraNeighbors->totalUnits = aboveUnits + leftUnits + 1;	// total number of units (each exists, but need not be available)
    intraNeighbors->aboveUnits = aboveUnits;					// number of units above (above + above-right)
    intraNeighbors->leftUnits = leftUnits;						// number of units to the left (left + below-left)
    intraNeighbors->unitWidth = 1 << log2UnitWidth;				// unit width
    intraNeighbors->unitHeight = 1 << log2UnitHeight;			// unit height
    intraNeighbors->log2TrSize = log2TrSize;					// log2 of the TU size
}

检查上方块是否可用(isAboveAvailable)
// Writes one availability flag per unit along the top edge, from partIdxLT to
// partIdxRT, into bValidFlags (ascending). A unit counts as available when
// getPUAbove() finds a CU and, if cip is true, that unit is intra coded.
// NOTE(review): in this listing numIntra is never incremented and the `else`
// before the `false` assignment is missing; both were probably lost together
// with the braces.
template<bool cip>
int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
	// Convert the z-scan indices to raster order first
    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
    const uint32_t idxStep = 1;
    int numIntra = 0;
	// Check the unit above each position in turn
    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags++)
        uint32_t partAbove;
        const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]);
		// Mark the position true when a usable unit is found, false otherwise
        if (cuAbove && (!cip || cuAbove->isIntra(partAbove)))
            *bValidFlags = true;
            *bValidFlags = false;
	// Number of available units above
    return numIntra;


// Returns the CU data that holds the unit directly above curPartUnitIdx and
// writes that unit's z-scan index to aPartUnitIdx. Depending on the position
// the result is the picture CTU, this CU, or m_cuAbove.
// NOTE(review): braces and an `else` are missing from this listing, so the
// exact pairing of the two inner branches cannot be read off the text.
const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const
    uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
	// Not in row 0: the unit above is one raster row up (absPartIdx - RASTER_SIZE)
    if (!isZeroRow(absPartIdx))
        uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU];
        aPartUnitIdx = g_rasterToZscan[absPartIdx - RASTER_SIZE];
        if (isEqualRow(absPartIdx, absZorderCUIdx))
            return m_encData->getPicCTU(m_cuAddr);
            aPartUnitIdx -= m_absIdxInCTU;
        return this;

    // Row 0: the index is taken from the last unit row (s_numPartInCUSize - 1) in the same
    // column, and m_cuAbove is returned
    aPartUnitIdx = g_rasterToZscan[absPartIdx + ((s_numPartInCUSize - 1) << LOG2_RASTER_SIZE)];
    return m_cuAbove;
}

检查右上方块是否可用(isAboveRightAvailable)
// Writes numUnits availability flags for the units above-right of partIdxRT
// (offsets 1..numUnits) into bValidFlags (ascending). A unit counts as
// available when getPUAboveRightAdi() finds a CU and, if cip is true, that
// unit is intra coded.
// NOTE(review): in this listing numIntra is never incremented and the `else`
// before the `false` assignment is missing.
template<bool cip>
int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits)
    int numIntra = 0;
	// Check each above-right unit in turn
    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
        uint32_t partAboveRight;
        const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
		// Mark the position true when the unit is usable, false otherwise
        if (cuAboveRight && (!cip || cuAboveRight->isIntra(partAboveRight)))
            *bValidFlags = true;
            *bValidFlags = false;

    return numIntra;


// Returns the CU data that holds the unit partUnitOffset units to the right of
// and one row above curPartUnitIdx, writing that unit's z-scan index to
// arPartUnitIdx. Returns NULL when the position lies outside the picture or
// when one of the checks below rejects it.
// NOTE(review): braces and `else` lines are missing from this listing.
const CUData* CUData::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const
	// The above-right position lies beyond the picture width (e.g. 1280 or 1920): nothing available
    if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[curPartUnitIdx] + (partUnitOffset << LOG2_UNIT_SIZE)) >= m_slice->m_sps->picWidthInLumaSamples)
        return NULL;

    uint32_t absPartIdxRT = g_zscanToRaster[curPartUnitIdx];
		检查absPartIdxRT所在一列是否小于s_numPartInCUSize - partUnitOffset对应一列
    if (lessThanCol(absPartIdxRT, s_numPartInCUSize - partUnitOffset))
		// Not in row 0: the candidate is one raster row up and partUnitOffset columns right
        if (!isZeroRow(absPartIdxRT))
            if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - RASTER_SIZE + partUnitOffset])
            	// Raster index of the top-right unit of the current CU
                uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
                // z-scan index of the above-right unit
                arPartUnitIdx = g_rasterToZscan[absPartIdxRT - RASTER_SIZE + partUnitOffset];
						计算的idx表示右上角PU位于当前CU的相对位置(例如8x8 CU的右上角PU的idx为1)
                if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx))
                    return m_encData->getPicCTU(m_cuAddr);
                    arPartUnitIdx -= m_absIdxInCTU;
                    return this;
            return NULL;
        // Row 0: the index is taken from the last unit row, shifted right by partUnitOffset,
        // and m_cuAbove is returned
        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + ((s_numPartInCUSize - 1) << LOG2_RASTER_SIZE) + partUnitOffset];
        return m_cuAbove;

    // Column check failed and the unit is not in row 0: nothing available
    if (!isZeroRow(absPartIdxRT))
        return NULL;

    // Column check failed and the unit is in row 0: the index is taken from the last unit row
    // at column (partUnitOffset - 1), and m_cuAboveRight is returned
    arPartUnitIdx = g_rasterToZscan[((s_numPartInCUSize - 1) << LOG2_RASTER_SIZE) + partUnitOffset - 1];
    return m_cuAboveRight;
}

检查左侧块是否可用(isLeftAvailable)
// Writes one availability flag per unit along the left edge, walking from
// partIdxLT down to partIdxLB in raster steps of RASTER_SIZE while the flag
// pointer moves backwards. A unit counts as available when getPULeft() finds
// a CU and, if cip is true, that unit is intra coded.
// NOTE(review): in this listing numIntra is never incremented and the `else`
// before the `false` assignment is missing.
template<bool cip>
int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
    const uint32_t idxStep = RASTER_SIZE; // one raster row per step (author notes idxStep = 16)
    int numIntra = 0;
	// Check the left neighbour of each row in turn; the units are visited top to bottom while
	// the flag pointer is decremented
    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
        uint32_t partLeft;
        const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]);
		// Mark the position true when a usable left unit is found, false otherwise
        if (cuLeft && (!cip || cuLeft->isIntra(partLeft)))
            *bValidFlags = true;
            *bValidFlags = false;

    return numIntra;


// Returns the CU data that holds the unit directly left of curPartUnitIdx and
// writes that unit's z-scan index to lPartUnitIdx. Depending on the position
// the result is the picture CTU, this CU, or m_cuLeft.
// NOTE(review): braces and an `else` are missing from this listing.
const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const
    uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];

	// Not in column 0: the left unit is the previous raster index (absPartIdx - 1)
    if (!isZeroCol(absPartIdx)) 
    	// Raster position of the current CU (its first unit)
        uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInCTU];
		// Index of the unit to the left, converted back to z-scan order
        lPartUnitIdx = g_rasterToZscan[absPartIdx - 1];
				例如lPartUnitIdx为3,表示当前CU中编号为3的PU(一共4个,编号分别为={0, 1, 2, 3})
        if (isEqualCol(absPartIdx, absZorderCUIdx))
            return m_encData->getPicCTU(m_cuAddr);
            lPartUnitIdx -= m_absIdxInCTU;
            return this;
	// Column 0: the index is taken from the last unit column (s_numPartInCUSize - 1) in the
	// same row, and m_cuLeft is returned
    lPartUnitIdx = g_rasterToZscan[absPartIdx + s_numPartInCUSize - 1];
    return m_cuLeft;
}

检查左下块是否可用(isBelowLeftAvailable)
// Writes numUnits availability flags for the units below-left of partIdxLB
// (offsets 1..numUnits) while the flag pointer moves backwards. A unit counts
// as available when getPUBelowLeftAdi() finds a CU and, if cip is true, that
// unit is intra coded.
// NOTE(review): in this listing numIntra is never incremented and the `else`
// before the `false` assignment is missing.
template<bool cip>
int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits)
    int numIntra = 0;
    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags--) // opposite direction
        uint32_t partBelowLeft;
        const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset);
        if (cuBelowLeft && (!cip || cuBelowLeft->isIntra(partBelowLeft)))
            *bValidFlags = true;
            *bValidFlags = false;

    return numIntra;


// Returns the CU data that holds the unit one column left of and
// partUnitOffset rows below curPartUnitIdx, writing that unit's z-scan index
// to blPartUnitIdx. Returns NULL when the position lies outside the picture
// or when one of the checks below rejects it.
// NOTE(review): braces and `else` lines are missing from this listing.
const CUData* CUData::getPUBelowLeftAdi(uint32_t& blPartUnitIdx,  uint32_t curPartUnitIdx, uint32_t partUnitOffset) const
	// The below-left position lies beyond the picture height: nothing available
    if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[curPartUnitIdx] + (partUnitOffset << LOG2_UNIT_SIZE)) >= m_slice->m_sps->picHeightInLumaSamples)
        return NULL;

    uint32_t absPartIdxLB = g_zscanToRaster[curPartUnitIdx];
		检查absPartIdxLB所对应的row,是否小于s_numPartInCUSize - partUnitOffset

    if (lessThanRow(absPartIdxLB, s_numPartInCUSize - partUnitOffset))
		// Not in column 0: the candidate is partUnitOffset raster rows down and one column left
        if (!isZeroCol(absPartIdxLB))
            if (curPartUnitIdx > g_rasterToZscan[absPartIdxLB + (partUnitOffset << LOG2_RASTER_SIZE) - 1])
                // Raster index of the bottom-left unit of the current CU
                uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInCTU] + (((1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1) << LOG2_RASTER_SIZE);
                blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (partUnitOffset << LOG2_RASTER_SIZE) - 1];
                if (isEqualRowOrCol(absPartIdxLB, absZorderCUIdxLB))
                    return m_encData->getPicCTU(m_cuAddr);
                    blPartUnitIdx -= m_absIdxInCTU;
                    return this;
            return NULL;
        // Column 0: the index is taken from the last unit column, partUnitOffset rows down,
        // and m_cuLeft is returned
        blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (partUnitOffset << LOG2_RASTER_SIZE) + s_numPartInCUSize - 1];
        return m_cuLeft;

	// The requested row falls outside the range accepted by the row check: nothing available
    return NULL;
}

填充相邻块及平滑滤波(initAdiPattern)
// Builds the luma reference samples for the PU at puAbsPartIdx:
// fillReferenceSamples() writes the unfiltered samples to intraNeighbourBuf[0],
// and, when filtering applies to this TU size / mode, a filtered copy is
// written to intraNeighbourBuf[1] (strong bilinear smoothing for flat 32x32
// borders, the regular intra_filter primitive otherwise).
// NOTE(review): braces, `else` lines and probably an early return after the
// strong-filter branch are missing from this listing.
void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
    int tuSize = 1 << intraNeighbors.log2TrSize;
    int tuSize2 = tuSize << 1;

    PicYuv* reconPic = cu.m_encData->m_reconPic;
	// Luma address of this PU in the reconstructed picture
    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
    intptr_t picStride = reconPic->m_stride;
	// Fill the reference samples
    fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);

    pixel* refBuf = intraNeighbourBuf[0]; // unfiltered reference samples
    pixel* fltBuf = intraNeighbourBuf[1]; // filtered reference samples

    pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
	// Filter only when it applies to this TU size: sizes 8/16/32 when dirMode == ALL_IDX,
	// otherwise as given by g_intraFilterFlags[dirMode]
    if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
        // generate filtered intra prediction samples
		// Strong intra smoothing requires the SPS flag and a TU size of 32
        if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
            const int threshold = 1 << (X265_DEPTH - 5); // equals 8 when X265_DEPTH == 8
            pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
			// Both borders must be nearly linear: |first + last - 2 * middle| below the threshold
            if (abs(topLeft + topLast  - (topMiddle  << 1)) < threshold &&
                abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
                // "strong" bilinear interpolation
				// Interpolate linearly between the corner sample and the last sample of each border
                const int shift = 5 + 1;
                int init = (topLeft << shift) + tuSize;
                int deltaL, deltaR;

                deltaL = leftLast - topLeft; deltaR = topLast - topLeft;

                fltBuf[0] = topLeft;
                for (int i = 1; i < tuSize2; i++)
						left = a + 1/2 + [(e - a) * i] / 64
						top = a + 1/2 + [(c - a) * i] / 64
                    fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
                    fltBuf[i] = (pixel)((init + deltaR * i) >> shift);           // Above Filtering
                fltBuf[tuSize2] = topLast;
                fltBuf[tuSize2 + tuSize2] = leftLast;
		// Regular reference sample filter
        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
}

填充参考像素(fillReferenceSamples)
// Builds the reference-sample array used for intra prediction of one block.
//
// adiOrigin       pointer to the block's top-left pixel in the reconstructed picture
// picStride       stride (in pixels) of that picture
// intraNeighbors  availability information for the neighbouring units
// dst             output: dst[0 .. refSize-1] receives the top-left sample followed by the
//                 above / above-right row, dst[refSize .. 2*refSize-2] receives the
//                 left / below-left column (refSize = 2 * tuSize + 1)
//
// Three cases are handled: no neighbour available (fill with the mid-grey DC value),
// every neighbour available (copy straight from the reconstructed picture), and
// partial availability (copy what exists, then pad the gaps from the nearest
// available sample).
//
// NOTE(review): this excerpt has lost its braces and several short statements
// (see the notes below); it documents the algorithm and is not compilable as shown.
void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258])
    const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
    int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
    int totalUnits = intraNeighbors.totalUnits;
    uint32_t tuSize = 1 << intraNeighbors.log2TrSize;
    uint32_t refSize = tuSize * 2 + 1;


    // Nothing is available, perform DC prediction.
    if (numIntraNeighbor == 0)
        // Fill top border with DC value
        for (uint32_t i = 0; i < refSize; i++)
            dst[i] = dcValue;

        // Fill left border with DC value
        for (uint32_t i = 0; i < refSize - 1; i++)
            dst[i + refSize] = dcValue;
    else if (numIntraNeighbor == totalUnits) // every unit is available: fill the border from reconstructed pixels
        // Fill top border with rec. samples
        const pixel* adiTemp = adiOrigin - picStride - 1;
        memcpy(dst, adiTemp, refSize * sizeof(pixel));

        // Fill left border with rec. samples
        adiTemp = adiOrigin - 1;
        for (uint32_t i = 0; i < refSize - 1; i++)
            dst[i + refSize] = adiTemp[0];
            adiTemp += picStride;
    else // reference samples are partially available
        // Only some of the neighbouring units are available.
        const bool *bNeighborFlags = intraNeighbors.bNeighborFlags;
        const bool *pNeighborFlags;
        int aboveUnits = intraNeighbors.aboveUnits;
        int leftUnits = intraNeighbors.leftUnits;
        int unitWidth = intraNeighbors.unitWidth;
        int unitHeight = intraNeighbors.unitHeight;
        int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
        pixel adiLineBuffer[5 * MAX_CU_SIZE]; // 5 * 64 = 320
        pixel *adi;

        // Initialize
        for (int i = 0; i < totalSamples; i++)
            adiLineBuffer[i] = dcValue; // start with every entry set to dcValue

        // Fill top-left sample
        // The line buffer is laid out as: left/below-left samples (bottom first),
        // then the top-left unit, then the above/above-right samples.
			|adiOrigin|          | ...
			|         |  adiTemp | ...

			|leftUnits * unitHeight |adi| ...
        const pixel* adiTemp = adiOrigin - picStride - 1;
        adi = adiLineBuffer + (leftUnits * unitHeight);
        pNeighborFlags = bNeighborFlags + leftUnits;
        if (*pNeighborFlags)
            pixel topLeftVal = adiTemp[0];
            for (int i = 0; i < unitWidth; i++)
                adi[i] = topLeftVal; // replicate the top-left value across unitWidth entries

        // Fill left & below-left samples
			|         |          | ...
			|adiOrigin|  adiTemp | ...
			|leftUnits * unitHeight |   | ...
        adiTemp += picStride;
        // NOTE(review): as shown, j == 0 writes adi[0], which is the slot the top-left
        // fill above just wrote; the upstream code presumably steps adi back by one
        // before this loop - confirm.
        // NOTE: over copy here, but reduce condition operators
        for (int j = 0; j < leftUnits * unitHeight; j++)
            adi[-j] = adiTemp[j * picStride];

        // Fill above & above-right samples
			|         |adiOrigin | ...
			|		  |  adiTemp | ...
			|leftUnits * unitHeight |   |aboveUnits * unitWidth |
        adiTemp = adiOrigin - picStride;
        adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth;
        // NOTE: over copy here, but reduce condition operators
        memcpy(adi, adiTemp, aboveUnits * unitWidth * sizeof(*adiTemp));

        // Pad reference samples when necessary
        // From here on, unavailable units are overwritten with padded values.
        int curr = 0;
        int next = 1;
        adi = adiLineBuffer;
        int pAdiLineTopRowOffset = leftUnits * (unitHeight - unitWidth);
        // If the bottom-most below-left unit is unavailable, take the fill value
        // from the nearest available unit.
        if (!bNeighborFlags[0]) 
            // very bottom unit of bottom-left; at least one unit will be valid.
            // Search for the first available unit.
            // NOTE(review): the loop body is not visible in this excerpt; presumably
            // it advances `next` - confirm against the upstream source.
            while (next < totalUnits && !bNeighborFlags[next])
				(1)next < leftUnits,说明需要填充的位置位于左侧(或左下)
				(2)next >= leftUnits,说明需要填充的位置位于上方(或右上)
            pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
            const pixel refSample = *pAdiLineNext; // first sample of the nearest available unit is the fill value
            // Pad unavailable samples with new value
            int nextOrTop = X265_MIN(next, leftUnits); // number of leading units that lie in the left column

            // fill left column
            // NOTE(review): neither of the two while loops below shows an update of
            // `curr`, and the loop pair is followed by a memset-based pair doing the
            // same fill. These look like two alternative builds (per-sample stores
            // vs. byte-wise memset) whose preprocessor guards and `curr++` lines were
            // lost in extraction - confirm.
            while (curr < nextOrTop)
                for (int i = 0; i < unitHeight; i++)
                    adi[i] = refSample;

                adi += unitHeight;

            // fill top row
            while (curr < next)
                for (int i = 0; i < unitWidth; i++)
                    adi[i] = refSample;

                adi += unitWidth;
            X265_CHECK(curr <= nextOrTop, "curr must be less than or equal to nextOrTop\n");

            // Pad the left-column units.
            if (curr < nextOrTop)
                const int fillSize = unitHeight * (nextOrTop - curr);
                memset(adi, refSample, fillSize * sizeof(pixel));
                curr = nextOrTop;
                adi += fillSize;
            // Pad the top-row units.
            if (curr < next)
                const int fillSize = unitWidth * (next - curr);
                memset(adi, refSample, fillSize * sizeof(pixel));
                curr = next;
                adi += fillSize;

        // pad all other reference samples.
        // Every remaining unavailable unit repeats the sample immediately before it.
        // NOTE(review): no update of `curr` and no `else` are visible here; the last
        // statement (skip over an available unit) is presumably the else-branch and
        // both branches presumably advance `curr` - confirm.
        while (curr < totalUnits)
            if (!bNeighborFlags[curr]) // samples not available
                int numSamplesInCurrUnit = (curr >= leftUnits) ? unitWidth : unitHeight;
                const pixel refSample = *(adi - 1);
                for (int i = 0; i < numSamplesInCurrUnit; i++)
                    adi[i] = refSample;

                adi += numSamplesInCurrUnit;
                adi += (curr >= leftUnits) ? unitWidth : unitHeight;

        // Copy processed samples
        // Top-left + above row are copied forwards into dst[0 .. refSize-1].
        adi = adiLineBuffer + refSize + unitWidth - 2;
        memcpy(dst, adi, refSize * sizeof(pixel));

        // The left column is stored bottom-first in the line buffer, so it is read
        // backwards into dst[refSize ..].
        adi = adiLineBuffer + refSize - 1;
        for (int i = 0; i < (int)refSize - 1; i++)
            dst[i + refSize] = adi[-(i + 1)];
}

基于不同模式的帧内预测(primitives.cu[sizeIdx].intra_pred)


// Excerpt of the routine that installs assembly-optimised primitives into `p`
// according to the CPU capability bits in `cpuMask`. Only the SSE2 sad setup and
// the SSE4 intra-prediction setup are shown; "// ..." marks elided code. The
// macro definitions and the four assignments at the end are the article's
// illustration of what ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4)
// expands to.
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main
#if X86_64
    p.scanPosLast = PFX(scanPosLast_x64);
    // Check whether the CPU supports SSE2 assembly acceleration.
    if (cpuMask & X265_CPU_SSE2) 
        /* We do not differentiate CPUs which support MMX and not SSE2. We only check
         * for SSE2 and then use both MMX and SSE2 functions */
        // Install the sad (sum of absolute differences) function pointers.
        AVC_LUMA_PU(sad, mmx2);
        AVC_LUMA_PU(sad_x3, mmx2);
        AVC_LUMA_PU(sad_x4, mmx2);
        p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_sse2);
        p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_sse2);
        p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_sse2);
        p.pu[LUMA_16x8].sad  = PFX(pixel_sad_16x8_sse2);
        p.pu[LUMA_16x8].sad_x3  = PFX(pixel_sad_x3_16x8_sse2);
        p.pu[LUMA_16x8].sad_x4  = PFX(pixel_sad_x4_16x8_sse2);
        // ...
    // ...
    // Check whether the CPU supports the SSE4 instruction set.
	if (cpuMask & X265_CPU_SSE4)
        // ...
			定义Planar、DC和all_angs的基于SSE4实现的intra pred函数
        ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
        ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
        ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
			#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
		    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
		    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
		    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
		    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
			#define PFX3(prefix, name) prefix ## _ ## name
			#define PFX2(prefix, name) PFX3(prefix, name)
			#define PFX(name)          PFX2(X265_NS, name)

			p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse4;
			p.cu[BLOCK_8x8].intra_pred_allangs = x265_all_angs_pred_8x8_sse4;
			p.cu[BLOCK_16x16].intra_pred_allangs = x265_all_angs_pred_16x16_sse4;
			p.cu[BLOCK_32x32].intra_pred_allangs = x265_all_angs_pred_32x32_sse4;
        // ...


; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;
; Excerpt (modes 2 and 3 only) of the SIMD routine that writes the predictions
; for the angular modes of a 4x4 block into one output buffer. Each mode occupies
; 16 bytes (4 rows x 4 pixels): mode 2 is written at [r0 + 0..15], mode 3 at
; [r0 + 16..31], and row 3 of mode 6 at [r0 + 76] = (6 - 2) * 16 + 3 * 4.
; NOTE(review): r0 / r1 presumably hold `dest` / `refPix` as assigned by the
; cglobal macro - confirm against x86inc.asm.
cglobal all_angs_pred_4x4, 4, 4, 8

; mode 2
; No interpolation is needed: each row is the same 8 loaded reference bytes
; shifted by one more byte.

movh      m0,         [r1 + 10]
movd      [r0],       m0

palignr   m1,         m0,      1
movd      [r0 + 4],   m1

palignr   m1,         m0,      2
movd      [r0 + 8],   m1

palignr   m1,         m0,      3
movd      [r0 + 12],  m1

; mode 3
; Each row is a weighted blend of adjacent reference pixels: pmaddubsw applies a
; weight row from ang_table, and pmulhrsw with pw_1024 rounds and scales the sum
; back to pixel range before packing.
; NOTE(review): the row indices 26, 20, 14, 8 match (26 * row) & 31 for rows 1..4,
; so ang_table is presumably indexed by the interpolation fraction - confirm.

mova          m2,        [pw_1024]

pslldq        m1,        m0,         1
pinsrb        m1,        [r1 + 9],   0
punpcklbw     m1,        m0

lea           r3,        [ang_table]

pmaddubsw     m6,        m1,        [r3 + 26 * 16]
pmulhrsw      m6,        m2
packuswb      m6,        m6
movd          [r0 + 16], m6

palignr       m0,        m1,        2

mova          m7,        [r3 + 20 * 16]

pmaddubsw     m3,        m0,        m7
pmulhrsw      m3,        m2
packuswb      m3,        m3
movd          [r0 + 20], m3

; mode 6 [row 3]
; The row just computed is stored a second time at the mode 6 / row 3 slot.
movd          [r0 + 76], m3

palignr       m3,        m1,       4

pmaddubsw     m4,        m3,        [r3 + 14 * 16]
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 24], m4

palignr       m4,        m1,        6

pmaddubsw     m4,        [r3 + 8 * 16]
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 28], m4
// ...


/* Installs the portable C implementations of the intra reference filter and the
 * intra prediction functions into the primitive table `p`, for the four square
 * block sizes 4x4, 8x8, 16x16 and 32x32.
 *
 * NOTE(review): the template arguments appear to be the block width for
 * intraFilter / intra_pred_dc_c / intra_pred_ang_c and log2 of the width for
 * planar_pred_c / all_angs_pred_c - confirm against the template definitions. */
void setupIntraPrimitives_c(EncoderPrimitives& p)
{
    // Reference-sample smoothing filters.
    p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
    p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
    p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
    p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;

    // Planar prediction.
    p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>;
    p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>;
    p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_c<4>;
    p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = planar_pred_c<5>;

    // DC prediction.
    p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_c<4>;
    p.cu[BLOCK_8x8].intra_pred[DC_IDX] = intra_pred_dc_c<8>;
    p.cu[BLOCK_16x16].intra_pred[DC_IDX] = intra_pred_dc_c<16>;
    p.cu[BLOCK_32x32].intra_pred[DC_IDX] = intra_pred_dc_c<32>;

    // Single-angle prediction: every angular mode (2 .. NUM_INTRA_MODE - 1)
    // shares one function per block size; the mode is passed at call time.
    for (int i = 2; i < NUM_INTRA_MODE; i++)
    {
        p.cu[BLOCK_4x4].intra_pred[i] = intra_pred_ang_c<4>;
        p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_c<8>;
        p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_c<16>;
        p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_c<32>;
    }

    // All-angles prediction. These C references are normally not used:
    // x265_setup_primitives() resets the pointers to NULL afterwards.
    p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_c<2>;
    p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_c<3>;
    p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_c<4>;
    p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_c<5>;
}

但是在实际使用时,如果使用C语言的intra pred的代码,会禁用all_angs_pred

// Excerpt of the one-time initialisation of the global `primitives` table: the C
// reference implementations are installed first, the all-angles C references are
// then cleared, and optimised versions are installed on top. The guard on
// primitives.pu[0].sad makes the setup run only while that entry is still unset.
// NOTE(review): several statements and preprocessor lines are missing from this
// excerpt (the C setup calls, the #endif / #if lines around the AltiVec branch,
// and the body of the final `if`); it is not compilable as shown.
void x265_setup_primitives(x265_param *param)
    if (!primitives.pu[0].sad)
        // Install the C reference primitives (the calls are elided in this excerpt).

        /* We do not want the encoder to use the un-optimized intra all-angles
         * C references. It is better to call the individual angle functions
         * instead. We must check for NULL before using this primitive */
        // This disables the all-angles C functions installed earlier.
        for (int i = 0; i < NUM_TR_SIZE; i++)
            primitives.cu[i].intra_pred_allangs = NULL;

#if X265_ARCH_X86
        setupInstrinsicPrimitives(primitives, param->cpuid);
        // Install the assembly-accelerated functions.
        setupAssemblyPrimitives(primitives, param->cpuid);
        if (param->cpuid & X265_CPU_ALTIVEC)
            setupPixelPrimitives_altivec(primitives);       // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupDCTPrimitives_altivec(primitives);         // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupFilterPrimitives_altivec(primitives);      // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
            setupIntraPrimitives_altivec(primitives);       // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions


        if (param->bLowPassDct)



template<int width>
void intra_pred_ang_c(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
    int width2 = width << 1;
    // Flip the neighbours in the horizontal case.
    int horMode = dirMode < 18; // 小于18,说明当前的模式为偏向水平的模式
    pixel neighbourBuf[129];
    const pixel *srcPix = srcPix0;

	// 如果是水平模式,需要转置
    if (horMode)
        neighbourBuf[0] = srcPix[0];
        for (int i = 0; i < width << 1; i++)
            neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
            neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
        srcPix = neighbourBuf;

    // Intra prediction angle and inverse angle tables.
    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };

    // Get the prediction angle.
    int angleOffset = horMode ? 10 - dirMode : dirMode - 26; // 计算angle偏移量
    int angle = angleTable[8 + angleOffset];

    // Vertical Prediction.
    if (!angle)
        for (int y = 0; y < width; y++)
            for (int x = 0; x < width; x++)
                dst[y * dstStride + x] = srcPix[1 + x];

        if (bFilter)
            int topLeft = srcPix[0], top = srcPix[1];
            for (int y = 0; y < width; y++)
                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
    else // Angular prediction.
        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
        pixel refBuf[64];
        const pixel *ref;

        // Use the projected left neighbours and the top neighbours.
        if (angle < 0)
            // Number of neighbours projected. 
            int nbProjected = -((width * angle) >> 5) - 1;
            pixel *ref_pix = refBuf + nbProjected + 1;

            // Project the neighbours.
            int invAngle = invAngleTable[- angleOffset - 1];
            int invAngleSum = 128;
            for (int i = 0; i < nbProjected; i++)
                invAngleSum += invAngle;
                ref_pix[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];

            // Copy the top-left and top pixels.
            for (int i = 0; i < width + 1; i++)
                ref_pix[-1 + i] = srcPix[i];
            ref = ref_pix;
        else // Use the top and top-right neighbours.
            ref = srcPix + 1;

        // Pass every row.
        int angleSum = 0;
        for (int y = 0; y < width; y++)
            angleSum += angle;
            int offset = angleSum >> 5;
            int fraction = angleSum & 31;

            if (fraction) // Interpolate
                for (int x = 0; x < width; x++)
                    dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
            else // Copy.
                for (int x = 0; x < width; x++)
                    dst[y * dstStride + x] = ref[offset + x];

    // Flip for horizontal.
    if (horMode)
        for (int y = 0; y < width - 1; y++)
            for (int x = y + 1; x < width; x++)
                pixel tmp              = dst[y * dstStride + x];
                dst[y * dstStride + x] = dst[x * dstStride + y];
                dst[x * dstStride + y] = tmp;
} 编码IntraLuma块(codeIntraLumaQT)


// Encodes the luma residual quad-tree of an intra CU at transform depth `tuDepth`,
// starting at partition `absPartIdx`. When the TU may stay unsplit it is predicted,
// transformed, reconstructed and costed as one block (fullCost); when it may split,
// the four sub-TUs are coded recursively (splitCost). The chosen cost is added to
// outCost and the reconstruction is written to the reconstructed picture so that
// later blocks can use it as intra reference.
//
// mode         mode under evaluation (supplies the CU, source and prediction buffers)
// cuGeom       geometry of the CU
// tuDepth      transform depth relative to the CU
// absPartIdx   partition index of this TU inside the CU
// bAllowSplit  whether splitting is permitted by the caller
// outCost      accumulated rdcost / distortion / bits / energy (added to, not reset)
// depthRange   depthRange[0] and depthRange[1] are the smallest and largest log2 TU
//              size allowed, as used by the two comparisons below
//
// NOTE(review): this excerpt has lost its braces, most `else` lines and a number of
// statements; the notes below mark the places where that is visible. It is not
// compilable as shown.
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
    CUData& cu = mode.cu;
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t qtLayer    = log2TrSize - 2;
    uint32_t sizeIdx    = log2TrSize - 2;
		(1)如果TUSize <= depth上限,可以不继续划分
		(2)如果TUSize > depth下限,并且参数允许划分,可以继续划分

		PS: 感觉正常的逻辑应该是,只要位于depth范围之内都有可能继续划分?
    bool mightNotSplit  = log2TrSize <= depthRange[1];
    bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    bool bEnableRDOQ  = !!m_param->rdoqLevel;

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    // With the maximum RD penalty, a 32x32 TU is forced to split into 16x16 TUs.
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
        mightNotSplit = false;
        mightSplit = true;

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
        // NOTE(review): the statement guarded by this `if` is not visible in the
        // excerpt; it presumably saves the entropy coder state (rqtRoot is loaded
        // again before the split pass below) - confirm.
        if (mightSplit)

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride   = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // get prediction signal
        // Run the intra prediction for the selected luma mode.
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // store original entropy coding status
        if (bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
        // Compute the residual (source minus prediction).
		primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
        // Forward transform and quantisation; numSig is the number of non-zero coefficients.
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
            // Coefficients survive: inverse transform and add to the prediction.
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
            // NOTE(review): an `else` line appears to be missing before the copy
            // below, as its comment describes the numSig == 0 case.
            // no coded residual, recon = pred
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);

        bCBF = !!numSig << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        // Distortion: sum of squared errors between reconstruction and source.
        // The next line is the article's example of the assembly routine bound to sse_pp.
			p.cu[BLOCK_4x4].sse_pp = PFX(pixel_ssd_4x4_ssse3);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);

        // CU-level syntax is only coded for the first partition of the CU.
        // NOTE(review): the statement guarded by the bTransquantBypassEnabled test is
        // not visible; codeSkipFlag is indented as a sibling of that `if` - confirm.
        if (!absPartIdx)
            if (!cu.m_slice->isIntra())
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                m_entropyCoder.codeSkipFlag(cu, 0);
            // Code the partition size.
            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        // Code the luma intra direction(s).
        // NOTE(review): an `else` appears to be missing before `qNumParts`; the
        // statements from there on code one direction per quarter of the CU, which
        // reads as the non-2Nx2N branch - confirm.
        if (cu.m_partSize[0] == SIZE_2Nx2N)
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
        // Code the luma coded-block flag.
        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        // Code the coefficients of the NxN transform block when the cbf is set.
        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
        // Total number of bits written for this TU.
        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;
        // Compute the RD cost according to the configured metric (psy-rd, ssim-rd or plain).
        // NOTE(review): two `else` lines appear to be missing: one before the
        // calcRdCost assignment (plain RD fallback), and one before
        // `fullCost.rdcost = MAX_INT64`, which reads as the branch taken when
        // mightNotSplit is false - confirm.
        if (m_rdCost.m_psyRd)
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        else if(m_rdCost.m_ssimRd)
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
        fullCost.rdcost = MAX_INT64;
    // If splitting is allowed, the TU is also coded as four sub-blocks.
    if (mightSplit)
        if (mightNotSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        // Code each of the four sub-TUs and collect their coded-block flags.
        // NOTE(review): an `else` appears to be missing between the transform-skip
        // call and the recursive call - confirm.
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            // Recompute the split RD cost now that the subdiv flag bits are included.
            // NOTE(review): an `else` appears to be missing before the calcRdCost line.
            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else if(m_rdCost.m_ssimRd)
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

        // Keep the split result when it is cheaper than the full-size TU.
        // NOTE(review): the statements that end the split-wins branch (presumably an
        // early return), the `else` that starts the full-TU-wins branch, and the
        // entropy-state restore announced by the comment below are not visible in
        // this excerpt - confirm.
        if (splitCost.rdcost < fullCost.rdcost)
            outCost.rdcost     += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits       += splitCost.bits;
            outCost.energy     += splitCost.energy;
            // recover entropy state of full-size TU encode

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);

    // set reconstruction for next intra prediction blocks if full TU prediction won
    // Store the reconstruction in the reconstructed picture for later intra prediction.
    PicYuv*  reconPic = m_frame->m_reconPic;
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
    // Accumulate the full-TU cost into the caller's total.
    outCost.rdcost     += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits       += fullCost.bits;
    outCost.energy     += fullCost.energy;

2.2 确认最佳模式(checkBestMode)


/* Check whether the current mode is the new best at the given depth.
 *
 * mode   candidate mode whose rdCost has already been computed
 * depth  CU depth whose best-mode slot (m_modeDepth[depth].bestMode) is updated
 *
 * The candidate replaces the recorded best mode only when its RD cost is strictly
 * lower; if nothing has been recorded yet at this depth it becomes the best mode. */
inline void checkBestMode(Mode& mode, uint32_t depth)
{
    ModeDepth& md = m_modeDepth[depth];
    if (md.bestMode)
    {
        // A best mode already exists: keep it on ties, replace it on a lower cost.
        if (mode.rdCost < md.bestMode->rdCost)
            md.bestMode = &mode;
    }
    else
    {
        // First candidate evaluated at this depth.
        md.bestMode = &mode;
    }
}






