【x265编码器】章节3——x265帧内预测流程

Captain1314_李祖团

已于 2024-03-24 20:53:02 修改

阅读量3.8w

点赞数 6

分类专栏：视频编码器文章标签： windows linux h.265 视频编解码音视频 ffmpeg

于 2023-08-28 00:23:50 首次发布

本文链接：https://blog.csdn.net/vcvdv123/article/details/132417145

版权

视频编码器专栏收录该内容

15 篇文章 9 订阅

订阅专栏

系列文章目录

HEVC视频编解码标准简介

【x264编码器】章节1——x264编码流程及基于x264的编码器demo

【x264编码器】章节2——x264的lookahead流程分析

【x264编码器】章节3——x264的码率控制

【x264编码器】章节4——x264的帧内预测流程

【x264编码器】章节5——x264的帧间预测流程

【x264编码器】章节6——x264的变换量化

【x265编码器】章节1——lookahead模块分析

【x265编码器】章节2——编码流程及基于x265的编码器demo

【x265编码器】章节3——帧内预测流程

【x265编码器】章节4——帧间预测流程

【x265编码器】章节5——x265帧间运动估计流程

【x265编码器】章节6——x265的码率控制

1.行编码FrameEncoder::processRowEncoder

2.Analysis::compressCTU

3.帧内预测Analysis::compressIntraCU

4.Intra模式结果检查Search::checkIntra

5.帧内亮度预测模块Search::estIntraPredQT

6.帧内色度预测模块Search::estIntraPredChromaQT

一、帧内预测编码流程

本章分析了x265的帧内预测流程，并给出了帧内预测的流程图（见图中紫色方块部分）；代码的详细分析见下文第二部分：

x265完整的流程框架如下：

二、代码分析

1.行编码FrameEncoder::processRowEncoder

该函数由工作线程调用，用于处理图像编码的每一行数据，代码分析如下：

/* Encodes the CTUs of one row of the current frame (m_frame).
 *
 * Starting at curRow.completed, each CTU is initialised, analysed with
 * tld.analysis.compressCTU(), entropy-coded through the row coder, and its
 * statistics are folded into curRow.rowStats and the frame's rate-control
 * records. When VBV is active, row state may be reset (completed = 0) so the
 * row is encoded again if rowVbvRateControl() reports a negative result.
 * Under WPP the loop may return early, marking the row inactive, when the row
 * above has not progressed far enough; the next row is activated once it
 * trails this row by at least two CTUs. After the last CTU, rate-control
 * statistics are updated, the slice is finished where applicable, and the
 * row filters are triggered.
 *
 * intRow - index of the row in m_rows
 * tld    - thread-local data; its analysis member performs the CTU analysis
 */
void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
{
    const uint32_t row = (uint32_t)intRow;
    CTURow& curRow = m_rows[row];

    if (m_param->bEnableWavefront)
    {   // With wavefront (WPP) enabled, hold curRow.lock while checking and claiming the row
        ScopedLock self(curRow.lock);
        if (!curRow.active)
            /* VBV restart is in progress, exit out */
            return;
        if (curRow.busy)// the busy flag is already set, i.e. the row has been claimed
        {
            /* On multi-socket Windows servers, we have seen problems with
             * ATOMIC_CAS which resulted in multiple worker threads processing
             * the same CU row, which often resulted in bad pointer accesses. We
             * believe the problem is fixed, but are leaving this check in place
             * to prevent crashes in case it is not */
            x265_log(m_param, X265_LOG_WARNING,
                     "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
            return;
        }
        curRow.busy = true;
    }

    /* When WPP is enabled, every row has its own row coder instance. Otherwise
     * they share row 0: rowCoder aliases curRow.rowGoOnCoder or m_rows[0].rowGoOnCoder */
    Entropy& rowCoder = m_param->bEnableWavefront ? curRow.rowGoOnCoder : m_rows[0].rowGoOnCoder;
    FrameData& curEncData = *m_frame->m_encData;
    Slice *slice = curEncData.m_slice;
    // Address of the row's first CTU; bIsVbv is set when both the VBV buffer size and max bitrate are positive
    const uint32_t numCols = m_numCols;
    const uint32_t lineStartCUAddr = row * numCols;
    bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
    // Slice that owns this row
    const uint32_t sliceId = curRow.sliceId;// below: picture width in units of 16 (rounded up), maxCUSize in units of 16, and first/last-row-in-slice flags
    uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
    uint32_t noOfBlocks = m_param->maxCUSize / 16;
    const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
    const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
    const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
    const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
    // On the first row of a slice, when no CTU of this row has been completed yet, seed rowCoder from m_initSliceContext
    // Load SBAC coder context from previous row and initialize row state.
    if (bFirstRowInSlice && !curRow.completed)        
        rowCoder.load(m_initSliceContext);     

    // calculate mean QP for consistent deltaQP signalling calculation
    if (m_param->bOptCUDeltaQP)
    {   // Hold curRow.lock while checking avgQPComputed and writing m_meanQP
        ScopedLock self(curRow.lock);
        if (!curRow.avgQPComputed)
        {
            if (m_param->bEnableWavefront || !row)
            {   // Accumulates the mean QP offset over the quantization groups visited below
                double meanQPOff = 0;
                bool isReferenced = IS_REFERENCED(m_frame);
                double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
                if (qpoffs)
                {
                    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

                    // Without WPP the whole picture height is scanned; with WPP only this CTU row
                    uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
                    if (m_param->bEnableWavefront)
                    {
                        cuYStart = intRow * m_param->maxCUSize;
                        height = cuYStart + m_param->maxCUSize;
                    }

                    uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
                    uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
                    uint32_t count = 0;
                    for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
                    {
                        for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
                        {
                            double qp_offset = 0;
                            uint32_t cnt = 0;

                            for (uint32_t block_yy = cuY; block_yy < cuY + qgSize && block_yy < m_frame->m_fencPic->m_picHeight; block_yy += loopIncr)
                            {
                                for (uint32_t block_xx = cuX; block_xx < cuX + qgSize && block_xx < width; block_xx += loopIncr)
                                {   // Sum the offsets of the loopIncr-sized blocks inside this quantization group
                                    int idx = ((block_yy / loopIncr) * (maxOffsetCols)) + (block_xx / loopIncr);
                                    qp_offset += qpoffs[idx];
                                    cnt++;
                                }
                            }
                            qp_offset /= cnt;
                            meanQPOff += qp_offset;
                            count++;
                        }
                    }
                    meanQPOff /= count;
                }// meanQPOff is now the average of the per-group averages (still 0 when qpoffs is null); add it to the slice QP
                rowCoder.m_meanQP = slice->m_sliceQp + meanQPOff;
            }
            else
            {
                // Non-WPP rows other than row 0 reuse the value stored in row 0's coder
                rowCoder.m_meanQP = m_rows[0].rowGoOnCoder.m_meanQP;
            }
            curRow.avgQPComputed = 1;
        }
    }
    // Vertical MV limits derived from this row's distance to the slice's first and last rows
    // (the *4 scaling is presumably quarter-sample units - TODO confirm)
    // Initialize restrict on MV range in slices
    tld.analysis.m_sliceMinY = -(int32_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
    tld.analysis.m_sliceMaxY = (int32_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4);

    // Handle single row slice
    if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY)
        tld.analysis.m_sliceMaxY = tld.analysis.m_sliceMinY = 0;


    while (curRow.completed < numCols)
    {
        ProfileScopeEvent(encodeCTU);

        const uint32_t col = curRow.completed;
        const uint32_t cuAddr = lineStartCUAddr + col;
        CUData* ctu = curEncData.getPicCTU(cuAddr);
        const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
        ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);// initialise the CTU at cuAddr with the slice QP and the slice-boundary flags

        if (bIsVbv)
        {
            if (col == 0 && !m_param->bEnableWavefront)
            {
                // Non-WPP: back up the output stream and the coder state at the start of the row;
                // both are restored further down if the row has to be re-encoded
                m_backupStreams[0].copyBits(&m_outStreams[0]);
                curRow.bufferedEntropy.copyState(rowCoder);
                curRow.bufferedEntropy.loadContexts(rowCoder);
            }
            if (bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] != intRow)
            {
                curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
            }

            // Base QP source: the CTU above-right (WPP), the previous row's rowQp (non-WPP), or this row's rowQp
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
            if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] != intRow)
                cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
            else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] != intRow)
                cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
            else
                cuStat.baseQp = curEncData.m_rowStat[row].rowQp;

            /* TODO: use defines from slicetype.h for lowres block size */
            uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks;
            uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks;
            if (!m_param->analysisLoad || !m_param->bDisableLookahead)
            {
                cuStat.vbvCost = 0;
                cuStat.intraVbvCost = 0;

                // Sum the lowres costs of the blocks covered by this CTU, clipped to the slice and picture bounds
                for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
                {
                    uint32_t idx = block_x + (block_y * maxBlockCols);

                    for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
                    {
                        cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
                        cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
                    }
                }
            }
        }
        else// no VBV: base QP is curEncData.m_avgQpRc
            curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
        // WPP: at the first column of any row that is not the slice's first row
        if (m_param->bEnableWavefront && !col && !bFirstRowInSlice)
        {   // Copy state from m_initSliceContext, then load contexts from the previous row's bufferedEntropy
            // Load SBAC coder context from previous row and initialize row state.
            rowCoder.copyState(m_initSliceContext);
            rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
        }// With dynamicRd, flag the CTU when (int32_t)(qpaRc - qpNoVbv) is positive
        if (m_param->dynamicRd && (int32_t)(m_rce.qpaRc - m_rce.qpNoVbv) > 0)
            ctu->m_vbvAffected = true;// mark the CTU as affected by VBV
        // Run the CTU analysis; 'best' refers to the returned top-level mode
        // Does all the CU analysis, returns best top level mode decision
        Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
        // With bDynamicRefine, collect this CTU's data into the row stats unless the frame precedes the current start point
        /* startPoint > encodeOrder is true when the start point changes for
        a new GOP but few frames from the previous GOP is still incomplete.
        The data of frames in this interval will not be used by any future frames. */
        if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
            collectDynDataRow(*ctu, &curRow.rowStats);
        // Atomically add m_activeWorkerCount to m_totalActiveWorkerCount
        // take a sample of the current active worker count
        ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
        ATOMIC_INC(&m_activeWorkerCountSamples);// and count the sample
        // Entropy-code the CTU through the row coder
        /* advance top-level row coder to include the context of this CTU.
         * if SAO is disabled, rowCoder writes the final CTU bitstream */
        rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
        // WPP: after the second CTU of the row (col == 1)
        if (m_param->bEnableWavefront && col == 1)
            // Save CABAC state for next row
            curRow.bufferedEntropy.loadContexts(rowCoder);
        // Slice uses SAO and bSaoNonDeblocked is set: gather SAO statistics before deblocking
        /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
        if (slice->m_bUseSao && m_param->bSaoNonDeblocked)
            m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
        // Loop-filter scheduling
        /* Deblock with idle threading */
        if (m_param->bEnableLoopFilter | slice->m_bUseSao)
        {
            // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
            if (!bIsVbv)
            {   // With a thread pool, for rows other than the slice's first row
                // Delay one row to avoid intra prediction conflict
                if (m_pool && !bFirstRowInSlice)
                {   // Default: allow the previous row's filter up to the current column                
                    int allowCol = col;
                    // From the third row of the slice onward
                    // avoid race condition on last column
                    if (rowInSlice >= 2)
                    {   // Clamp to row-2's progress: m_lastDeblocked when on the last column, otherwise m_lastCol
                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
                                                                  : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
                    }// publish the limit to the previous row's filter
                    m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
                }
                // With a thread pool, on the slice's last row
                // Last Row may start early
                if (m_pool && bLastRowInSlice)
                {
                    // Deblocking last row
                    int allowCol = col;

                    // avoid race condition on last column
                    if (rowInSlice >= 2)
                    {   // Clamp to row-1's progress: m_lastDeblocked when on the last column, otherwise m_lastCol
                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
                                                                  : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
                    }// publish the limit to this row's own filter
                    m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
                }
            } // end of !bIsVbv
        }
        // Both Loopfilter and SAO Disabled
        else
        {   // No deblocking or SAO: go straight to post-CU processing for this column
            m_frameFilter.m_parallelFilter[row].processPostCu(col);
        }

        // Completed CU processing: advance the row's completed-CTU counter
        curRow.completed++;
        // Per-CTU statistics are gathered into a local FrameStats, then folded into the row totals
        FrameStats frameLog;
        curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);

        // copy number of intra, inter cu per row into frame stats for 2 pass
        if (m_param->rc.bStatWrite)
        {
            curRow.rowStats.mvBits    += best.mvBits;
            curRow.rowStats.coeffBits += best.coeffBits;
            curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);

            for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
            {
                /* 1 << shift == number of 8x8 blocks at current depth */
                int shift = 2 * (m_param->maxCUDepth - depth);
                int cuSize = m_param->maxCUSize >> depth;

                curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
                                                               (int)(frameLog.cntIntra[depth] << shift);

                curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
                curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
            }
        }// the totals below are accumulated for every CTU, regardless of bStatWrite
        curRow.rowStats.totalCtu++;// one more CTU in this row
        curRow.rowStats.lumaDistortion   += best.lumaDistortion;// luma distortion of the best mode
        curRow.rowStats.chromaDistortion += best.chromaDistortion;
        curRow.rowStats.psyEnergy        += best.psyEnergy;// psy energy of the best mode
        curRow.rowStats.ssimEnergy       += best.ssimEnergy;
        curRow.rowStats.resEnergy        += best.resEnergy;
        curRow.rowStats.cntIntraNxN      += frameLog.cntIntraNxN;
        curRow.rowStats.totalCu          += frameLog.totalCu;
        for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
        {
            curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
            curRow.rowStats.cntMergeCu[depth] += frameLog.cntMergeCu[depth];
            for (int m = 0; m < INTER_MODES; m++)
                curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
            for (int n = 0; n < INTRA_MODES; n++)
                curRow.rowStats.cuIntraDistribution[depth][n] += frameLog.cuIntraDistribution[depth][n];
        }
        // Record the total bits of the best mode for this CTU
        curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
        x265_emms();

        if (bIsVbv)
        {   
            // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];    
            if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
            {
                curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
                curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
                curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
                curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
                curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
            }
            
            // If current block is at row end checkpoint, call vbv ratecontrol.
            if (!m_param->bEnableWavefront && col == numCols - 1)
            {
                double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
                curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                curEncData.m_rowStat[row].rowQp = qpBase;
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
                if (curRow.reEncode < 0)
                {
                    x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
                        m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);

                    // Restore the stream and coder state backed up at column 0, then clear the
                    // row's progress and statistics so the while loop encodes the row again
                    m_vbvResetTriggerRow[curRow.sliceId] = row;
                    m_outStreams[0].copyBits(&m_backupStreams[0]);

                    rowCoder.copyState(curRow.bufferedEntropy);
                    rowCoder.loadContexts(curRow.bufferedEntropy);

                    curRow.completed = 0;
                    memset(&curRow.rowStats, 0, sizeof(curRow.rowStats));
                    curEncData.m_rowStat[row].numEncodedCUs = 0;
                    curEncData.m_rowStat[row].encodedBits = 0;
                    curEncData.m_rowStat[row].rowSatd = 0;
                    curEncData.m_rowStat[row].rowIntraSatd = 0;
                    curEncData.m_rowStat[row].sumQpRc = 0;
                    curEncData.m_rowStat[row].sumQpAq = 0;
                }
            }
            // If current block is at row diagonal checkpoint, call vbv ratecontrol.
            else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
            {
                if (m_param->rc.bEnableConstVbv)
                {
                    uint32_t startCuAddr = numCols * row;
                    uint32_t EndCuAddr = startCuAddr + col;

                    // Walk from this row up to the slice's first row, accumulating CTU stats into each row's totals
                    for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
                    {
                        for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
                        {
                            curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                            curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
                            curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
                            curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
                            curEncData.m_rowStat[r].numEncodedCUs = c;
                        }
                        if (curRow.reEncode < 0)
                            break;
                        startCuAddr = EndCuAddr - numCols;
                        EndCuAddr = startCuAddr + 1;
                    }
                }
                double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
                curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                curEncData.m_rowStat[row].rowQp = qpBase;
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);

                if (curRow.reEncode < 0)
                {
                    x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
                             m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);

                    // prevent the WaveFront::findJob() method from providing new jobs
                    m_vbvResetTriggerRow[curRow.sliceId] = row;
                    m_bAllRowsStop[curRow.sliceId] = true;

                    // Reset every row from the slice's last row back down to this one
                    // (row >= 1 here because this branch requires !bFirstRowInSlice)
                    for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
                    {
                        CTURow& stopRow = m_rows[r];

                        if (r != row)
                        {
                            /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
                            stopRow.lock.acquire();
                            while (stopRow.active)
                            {
                                if (dequeueRow(m_row_to_idx[r] * 2))
                                    stopRow.active = false;
                                else
                                {
                                    /* we must release the row lock to allow the thread to exit */
                                    stopRow.lock.release();
                                    GIVE_UP_TIME();
                                    stopRow.lock.acquire();
                                }
                            }
                            stopRow.lock.release();

                            // Wait until the row's busy flag has been cleared
                            bool bRowBusy = true;
                            do
                            {
                                stopRow.lock.acquire();
                                bRowBusy = stopRow.busy;
                                stopRow.lock.release();

                                if (bRowBusy)
                                {
                                    GIVE_UP_TIME();
                                }
                            }
                            while (bRowBusy);
                        }

                        m_outStreams[r].resetBits();
                        stopRow.completed = 0;
                        memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
                        curEncData.m_rowStat[r].numEncodedCUs = 0;
                        curEncData.m_rowStat[r].encodedBits = 0;
                        curEncData.m_rowStat[r].rowSatd = 0;
                        curEncData.m_rowStat[r].rowIntraSatd = 0;
                        curEncData.m_rowStat[r].sumQpRc = 0;
                        curEncData.m_rowStat[r].sumQpAq = 0;
                    }

                    m_bAllRowsStop[curRow.sliceId] = false;
                }
            }
        }

        if (m_param->bEnableWavefront && curRow.completed >= 2 && !bLastRowInSlice &&
            (!m_bAllRowsStop[curRow.sliceId] || intRow + 1 < m_vbvResetTriggerRow[curRow.sliceId]))
        {
            /* activate next row */
            ScopedLock below(m_rows[row + 1].lock);
            // Activate the next row only if it is inactive and trails this row by at least two CTUs
            if (m_rows[row + 1].active == false &&
                m_rows[row + 1].completed + 2 <= curRow.completed)
            {   // mark the next row active
                m_rows[row + 1].active = true;
                enqueueRowEncoder(m_row_to_idx[row + 1]);// queue the next row for encoding
                tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
            }
        }

        ScopedLock self(curRow.lock);
        if ((m_bAllRowsStop[curRow.sliceId] && intRow > m_vbvResetTriggerRow[curRow.sliceId]) ||
            (!bFirstRowInSlice && ((curRow.completed < numCols - 1) || (m_rows[row - 1].completed < numCols)) && m_rows[row - 1].completed < curRow.completed + 2))
        {   // Yield: mark this row inactive and not busy, count the block, and return
            curRow.active = false;
            curRow.busy = false;
            ATOMIC_INC(&m_countRowBlocks);
            return;
        }
    }
    // All CTUs of this row are done; with WPP and constant VBV, the slice's last row completes the per-row totals
    /* this row of CTUs has been compressed */
    if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
    {
        if (bLastRowInSlice)       
        {
            for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
            {
                // Add the CTUs after each row's last recorded numEncodedCUs up to the end of that row
                for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
                {
                    curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                    curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
                    curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
                    curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
                    curEncData.m_rowStat[r].numEncodedCUs = c;
                }
            }
        }
    }
    // Mid-frame rate-control statistics update (ABR mode or VBV)
    /* If encoding with ABR, update update bits and complexity in rate control
     * after a number of rows so the next frame's rateControlStart has more
     * accurate data for estimation. At the start of the encode we update stats
     * after half the frame is encoded, but after this initial period we update
     * after refLagRows (the number of rows reference frames must have completed
     * before referencees may begin encoding) */
    if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
    {
        uint32_t rowCount = 0;// row-in-slice index at which the stats update fires
        uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];// number of rows in this slice
        // encodeOrder == 0: trigger on the last row of the slice
        if (!m_rce.encodeOrder)
            rowCount = maxRows - 1; // else, while encodeOrder <= 2 * (fpsNum / fpsDenom): trigger around half the slice
        else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
            rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
        else// afterwards: trigger at m_refLagRows / maxSlices, capped at the slice's last row
            rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
        // Only the row whose index within the slice equals rowCount performs the update
        if (rowInSlice == rowCount)
        {   // reset this slice's running bit total
            m_rowSliceTotalBits[sliceId] = 0;
            if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
            {   // sum encodedBits over the slice's rows that precede rowCount
                for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
                    m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
            }
            else
            {
                // otherwise sum totalBits per CTU over the same rows
                uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
                uint32_t finishAddr = startAddr + rowCount * numCols;
                
                for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
                    m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
            }
            // When the incremented slice counter equals maxSlices
            if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
            {   // sum the per-slice totals into m_rce.rowTotalBits and pass them to rate control
                m_rce.rowTotalBits = 0;
                for (uint32_t i = 0; i < m_param->maxSlices; i++)
                    m_rce.rowTotalBits += m_rowSliceTotalBits[i];
                m_top->m_rateControl->rateControlUpdateStats(&m_rce);
            }
        }
    }
    // Finish the slice in the row coder when SAO is off and either WPP is on or this is the slice's last row
    /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
    /* end_of_sub_stream_one_bit / end_of_slice_segment_flag */
       if (!slice->m_bUseSao && (m_param->bEnableWavefront || bLastRowInSlice))
               rowCoder.finishSlice();

    // With loop filter or SAO enabled, from the third row of the slice onward
    /* Processing left Deblock block with current threading */
    if ((m_param->bEnableLoopFilter | slice->m_bUseSao) & (rowInSlice >= 2))
    {   // Has row-2 reported m_lastDeblocked equal to the number of columns?
        /* Check conditional to start previous row process with current threading */
        if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
        {   // Allow row-1's filter to cover every column and run its tasks via processTasks(-1)
            /* stop threading on current row and restart it */
            m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
            m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
        }
    }

    /* trigger row-wise loop filters */
    if (m_param->bEnableWavefront)
    {    // The filter row enabled here trails the encoded row by m_filterRowDelay rows
        if (rowInSlice >= m_filterRowDelay)
        {
            enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
            // When rowInSlice equals the delay, row - m_filterRowDelay is the slice's first row: enqueue its filter too
            /* NOTE: Activate filter if first row (row 0) */
            if (rowInSlice == m_filterRowDelay)
                enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
            tryWakeOne();
        }
        // On the slice's last row
        if (bLastRowInSlice)
        {   // enable the filters of the slice's final m_filterRowDelay rows
            for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
            {
                enableRowFilter(m_row_to_idx[i]);
            }
            tryWakeOne();
        }
        // Slice consisting of a single row
        // handle specially case - single row slice
        if  (bFirstRowInSlice & bLastRowInSlice)
        {   // enqueue this row's filter directly
            enqueueRowFilter(m_row_to_idx[row]);
            tryWakeOne();
        }
    }

    curRow.busy = false;

    // CHECK_ME: Does it always FALSE condition?
    if (ATOMIC_INC(&m_completionCount) == 2 * (int)m_numRows)
        m_completionEvent.trigger();
}

2.Analysis::compressCTU

函数功能：对CTU进行分析并返回最佳的模式决策；根据帧类型的不同，进行帧内预测或帧间预测：

/* Analyse one CTU and return the best mode found at depth 0.
 *
 * Sets up per-CTU state (slice, frame, params, QP, entropy context, source
 * pixels), optionally loads externally supplied CTU info and analysis-reuse
 * pointers, then dispatches to compressIntraCU() for I slices or to one of
 * the inter compress paths for every other slice type.
 *
 * ctu            - CTU being analysed; its depth/mode arrays may be pre-filled here
 * frame          - frame that owns the CTU
 * cuGeom         - geometry of the CTU root
 * initialContext - entropy state loaded into m_rqt[0].cur before analysis
 * returns        - reference to *m_modeDepth[0].bestMode */
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;
    // NOTE(review): this reads m_param before it is reassigned from the frame
    // on the next line, so it uses the previously held param set - confirm intended.
    m_bChromaSa8d = m_param->rdLevel >= 3;
    m_param = m_frame->m_param;

#if _DEBUG || CHECKED_BUILD
    invalidateContexts(0);
#endif
    // Choose the CTU QP: calculateQpforCuSize() when the PPS enables delta QP,
    // otherwise the slice QP. setLambdaFromQP() returns the QP actually used,
    // which is then stored for the CTU via setQPSubParts().
    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
    ctu.setQPSubParts((int8_t)qp, 0, 0);

    // Seed depth-0 entropy state and copy the CTU's source pixels from the
    // frame's fenc picture into the depth-0 working buffer.
    m_rqt[0].cur.load(initialContext);
    ctu.m_meanQP = initialContext.m_meanQP;
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
    // With SSIM-based RD enabled, compute the normalization factor for this CTU/QP
    if (m_param->bSsimRd)
        calculateNormFactor(ctu, qp);

    uint32_t numPartition = ctu.m_numPartitions;
    // Externally supplied CTU info: expand the per-entry depth/content/change
    // values into the per-partition add-on arrays of the frame.
    if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr))
    {
        x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr;
        int32_t depthIdx = 0;
        uint32_t maxNum8x8Partitions = 64;
        uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
        uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        do
        {
            uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
            uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
            int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
            // Each entry covers (numPartition >> 2 * depth) partitions
            memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
            // int-valued fill, done element by element
            for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
                prevCtuInfoChangePtr[l] = prevCtuInfoChange;
            depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
            contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
            prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
            depthIdx++;
        } while (ctuTemp->ctuPartitions[depthIdx] != 0); // stop at the first 0 entry after the first one

        m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
        //Calculate log2CUSize from depth
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
    }
    // Multi-pass refinement on a non-I slice while reading stats: point the
    // reuse pointers at this CTU's slice of the stored inter analysis data.
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
    {   // P slices use one prediction direction, other inter slices use two
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        for (int dir = 0; dir < numPredDir; dir++)// one MV / MVP-index pointer per direction
        {
            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
        }
        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
    }
    
    // Analysis save/load on a non-I slice with reuse level in (1, 10): set up
    // the reuse pointers that the inter compress paths read or write.
    int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
    if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
    {   // Same direction count rule as above: 1 for P, 2 otherwise
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        // m_reuseRef is only (re)pointed when the save or load level is in (1, 7)
        if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
            ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        if (reuseLevel > 4)
        {
            m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
        }
        if (m_param->analysisSave && !m_param->analysisLoad)// save-only run: reset every reference slot of this CTU to -1
            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
                m_reuseRef[i] = -1;
    }
    ProfileCUScope(ctu, totalCTUTime, totalCTUs);

    if (m_slice->m_sliceType == I_SLICE)
    {
        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
        // Load level > 1: pre-fill depth, luma/chroma modes and part sizes from
        // the stored intra analysis so compressIntraCU() can reuse them.
        if (m_param->analysisLoadReuseLevel > 1)
        {
            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
        }// run intra analysis on the CTU
        compressIntraCU(ctu, cuGeom, qp);
    }
    else
    {
        bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
        bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
        bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
        bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;
        // Copy stored analysis into the CTU when any of the conditions above holds
        if (bCopyAnalysis)
        {
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;// offset of this CTU in the stored arrays; depth, pred mode, part size and skip flags are copied below
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);

            // Intra directions are copied too when intra is possible in this
            // slice and the analysis did not come from AVC_INFO
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            //Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
        }
        // Intra refresh: a P-slice CTU whose column index lies in
        // [pirStartCol, pirEndCol) is analysed as intra
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
            && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
            compressIntraCU(ctu, cuGeom, qp);
        else if (!m_param->rdLevel)// rdLevel == 0
        {   // Copy source pixels into the recon picture so intra prediction has neighbours
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
            // Analyse the CTU with the rd 0..4 inter path
            compressInterCU_rd0_4(ctu, cuGeom, qp);
            // Residual for the whole CTU is produced afterwards in one pass
            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
                ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
        {
            // Full reuse: load the stored decisions and only run qprdRefine(),
            // then return early (the refine/statistics tail below is skipped)
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            //Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];

            qprdRefine (ctu, cuGeom, qp, qp);
            return *m_modeDepth[0].bestMode;
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom, qp);
        else if (m_param->rdLevel <= 4)// rdLevel 1..4: rd0_4 inter path
            compressInterCU_rd0_4(ctu, cuGeom, qp);
        else// remaining rd levels: rd5_6 inter path
            compressInterCU_rd5_6(ctu, cuGeom, qp);
    }

    if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
        qprdRefine(ctu, cuGeom, qp, qp);

    if (m_param->csvLogLevel >= 2)
        collectPUStatistics(ctu, cuGeom);

    return *m_modeDepth[0].bestMode;
}

3.帧内预测Analysis::compressIntraCU

该函数实现了对CU四叉树的递归帧内分析。首先在当前深度评估帧内模式（若已有分析结果可复用，则直接使用已决定的模式/深度）；然后在允许分割时，递归压缩4个子CU，累加其代价并保存各子CU的最佳CU数据和重建数据；最后将分割方案与当前深度的最佳模式比较，把最佳结果拷贝到编码数据和重建图像中，并返回其RD代价，代码分析如下：

/* Recursive intra analysis of one CU.
 *
 * Evaluates intra candidates at the current depth (or re-checks a mode/depth
 * taken from previously loaded analysis), optionally recurses into the four
 * child CUs and evaluates the split candidate, and leaves the winner in
 * m_modeDepth[depth].bestMode. The winner's CU data is copied to the picture's
 * encData, and its reconstruction is copied to the recon picture unless the
 * winner is the split candidate.
 *
 * parentCTU - CTU that contains this CU
 * cuGeom    - geometry of the CU analysed by this call
 * qp        - QP used to initialise the candidate CUs at this depth
 * returns   - rdCost of the best mode at this depth */
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;
    // mightSplit: the CU is not a leaf, so splitting is possible.
    // mightNotSplit: splitting is not mandatory, so the CU may be coded unsplit.
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    // bAlreadyDecided: intraRefine != 4, a luma direction other than ALL_IDX is
    // already stored for this partition, and the analysis type is not HEVC_INFO.
    // bDecidedDepth: intraRefine != 4 and the stored depth equals the current depth.
    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;
    if (m_param->intraRefine && m_param->intraRefine != 4)
    {   // With a scale factor and a decided depth, still allow the split check
        // when splitting is mandatory or the CU is one size above the minimum CU
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
        // A minimum-size CU whose stored depth differs is analysed from scratch
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
            bAlreadyDecided = false;
    }

    if (bAlreadyDecided)
    {
        if (bDecidedDepth && mightNotSplit)
        {   // Stored depth matches and the CU may stay unsplit: evaluate md.pred[0]
            // with the stored partition size and make it the best mode
            Mode& mode = md.pred[0];
            md.bestMode = &mode;
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            // Stored directions are reused unless intraRefine is 3, or is 2 with
            // a stored luma direction above DC_IDX
            bool reuseModes = !((m_param->intraRefine == 3) ||
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
            if (reuseModes)
            {
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
            }
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
    
            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }
    }// No intra candidate is evaluated for a CU of MAX_LOG2_CU_SIZE (presumably 64x64 - confirm), so such a CU can only be coded split
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
    {    // Nothing decided beforehand and the CU may stay unsplit: evaluate 2Nx2N intra
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
        checkBestMode(md.pred[PRED_INTRA], depth);
        // 8x8 CU (log2CUSize == 3) with a minimum TU log2 size below 3: also try NxN
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
        {   // i.e. intra prediction blocks can go down to 4x4
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }
        // Optional lossless attempt
        if (m_bTryLossless)
            tryLossless(cuGeom);
        // If a split is possible, the unsplit candidate pays for the split flag
        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }
    // Recursion stops at the depth of the loaded decision unless `split` forces it
    // stop recursion if we reach the depth of previous analysis decision
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;

    if (mightSplit)
    {   // Build the split candidate in md.pred[PRED_SPLIT]: reset its costs and
        // initialise its CU from the parent at this geometry
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);
        // Working state of the next depth
        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);// invalidate the next depth's contexts
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;
        uint64_t curCost = 0;
        int skipSplitCheck = 0;// set to 1 when the split evaluation is abandoned early
        // Visit the four child CUs
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {   // Copy this child's part of the CTU-level source buffer into
                // nd.fencYuv and seed the next depth's entropy state from nextContext
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
                // With bEnableSplitRdSkip, accumulate the children's costs and stop
                // as soon as they exceed the rdCost of this depth's current best mode
                if (m_param->bEnableSplitRdSkip)
                {
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
                    {
                        skipSplitCheck = 1;
                        break;
                    }
                }
                else// plain recursion, return value unused
                    compressIntraCU(parentCTU, childGeom, nextQP);
                // Merge the child's best CU, costs and reconstruction into the split candidate
                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);// accumulate the child's costs
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);

                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
                if (bAlreadyDecided)
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
            }
        }// Finalise the split candidate only if the loop was not abandoned early
        if (!skipSplitCheck)
        {   // Store the entropy state reached after the last child into the split candidate
            nextContext->store(splitPred->contexts);
            if (mightNotSplit)// split was optional: add the split flag cost; otherwise just refresh the mode cost
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else
                updateModeCost(*splitPred);
            // Delta-QP check for the split candidate, then compare it against the current best
            checkDQPForSplitPred(*splitPred, cuGeom);
            checkBestMode(*splitPred, depth);
        }
    }
    
    // With RD refine enabled, cache this CU's best rdCost; the slot index is
    // derived from cuGeom.childOffset
    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        int cuIdx = (cuGeom.childOffset - 1) / 3;
        cacheCost[cuIdx] = md.bestMode->rdCost;
    }

    // With the neighbour TU limit flag set and log2CUSize >= 4 (CU of 16x16 or
    // larger), record the maximum TU depth of the best CU in the picture CTU
    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
    {
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
        int8_t maxTUDepth = -1;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
    }
    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])// recon is copied only when the winner is not the split candidate
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);

    return md.bestMode->rdCost;
}

4.Intra模式结果检查Search::checkIntra

对Intra模式进行了一系列的评估和编码操作，包括亮度和色度失真的估计、熵编码、能量计算等。这些评估结果将用于模式选择和RD优化过程中，代码分析如下：

/* Evaluate one intra candidate for a CU.
 *
 * Marks the CU as intra with the requested partition size, runs the luma
 * (and, unless the encode is 4:0:0, chroma) intra estimation, counts the bits
 * of the CU syntax with the entropy coder, fills the energy terms of the mode
 * and finally updates its cost.
 *
 * intraMode - candidate mode to fill (costs, bits, contexts, energies)
 * cuGeom    - geometry of the CU being evaluated
 * partSize  - partition size to evaluate */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& intraCU = intraMode.cu;

    // Tag the whole CU as intra with the requested partitioning
    intraCU.setPartSizeSubParts(partSize);
    intraCU.setPredModeSubParts(MODE_INTRA);

    uint32_t lumaTuRange[2];
    intraCU.getIntraTUQtDepthRange(lumaTuRange, 0);

    // Distortion: luma always, chroma unless the internal colour space is 4:0:0
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, lumaTuRange);
    if (m_csp == X265_CSP_I400)
        intraMode.distortion += intraMode.lumaDistortion;
    else
    {
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
    }
    intraCU.m_distortion[0] = intraMode.distortion;

    // Bit counting starts from a cleared counter
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(intraCU.m_tqBypass[0]);

    // In a non-intra slice the skip flag and the prediction mode are coded too
    int bitsForSkipFlag = 0;
    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(intraCU, 0);
        bitsForSkipFlag = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(intraCU.m_predMode[0]);
    }

    // Partition size and prediction info; their bits are recorded in mvBits
    m_entropyCoder.codePartSize(intraCU, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(intraCU, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - bitsForSkipFlag;

    // Coefficients; the delta-QP request comes from the PPS
    bool codeDeltaQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(intraCU, 0, codeDeltaQP, lumaTuRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - bitsForSkipFlag;

    // Psy energy when psy-rd is on, otherwise SSIM energy when ssim-rd is on
    const Yuv* srcYuv = intraMode.fencYuv;
    Yuv& recon = intraMode.reconYuv;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, srcYuv->m_buf[0], srcYuv->m_size, recon.m_buf[0], recon.m_size);
    else if (m_rdCost.m_ssimRd)
        intraMode.ssimEnergy = m_quant.ssimDistortion(intraCU, srcYuv->m_buf[0], srcYuv->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);

    // Residual energy: SSE between the source and the prediction (luma plane)
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(srcYuv->m_buf[0], srcYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}

5.帧内亮度预测模块Search::estIntraPredQT

遍历所有帧内预测模式，得到最佳模式，并进一步测量和编码处理，代码分析如下：

/* Luma intra mode decision for one CU.
 *
 * For every PU of the CU: if no luma direction is preset, rank all 35 modes
 * with a prediction-only cost, run a simple RDO (no TU splits) on the best
 * candidates, then re-measure the chosen mode with TU splits allowed and
 * extract the result into the mode's recon buffer.
 *
 * intraMode  - candidate mode whose CU, pred and recon buffers are used
 * cuGeom     - geometry of the CU
 * depthRange - TU depth range passed through to codeIntraLumaQT()
 * returns    - sum of the luma distortion over all PUs */
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{   // Shortcuts to the mode's CU data and to its recon, prediction and source buffers
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;
    // CU depth; initTuDepth is 1 when the partition size is not 2Nx2N, giving 4 PUs
    uint32_t depth        = cuGeom.depth;
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
    uint32_t numPU        = 1 << (2 * initTuDepth);
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
    uint32_t tuSize       = 1 << log2TrSize;// edge length of one PU/TU in pixels
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;// a quarter of the CU's partitions, used as the per-PU step
    uint32_t sizeIdx      = log2TrSize - 2;
    uint32_t absPartIdx   = 0;
    sse_t totalDistortion = 0;

    // Transform skip is only tried when the PPS enables it, the CU is not
    // transquant-bypass and the partition size is not 2Nx2N
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    // loop over partitions; absPartIdx advances by qNumParts for each PU
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;
        // A preset direction (anything other than ALL_IDX) is used as-is
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
        else
        {
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
            // Stage 1: prediction-only cost for every mode
            {
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                // Reference sample smoothing
                IntraNeighbors intraNeighbors;
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);// presumably fills intraNeighbourBuf[0]/[1] used below - confirm

                // determine set of modes to be tested (using prediction signal only)
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                uint32_t stride = predYuv->m_size;

                int scaleTuSize = tuSize;
                int scaleStride = stride;
                int costShift = 0;
                // Load the luma intra-direction state of the entropy coder from this depth
                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
                // getIntraRemModeBits() returns rbits, the bit cost used for every
                // non-MPM mode; it also fills mpmModes and mpms, which is tested
                // below as a bitmask indexed by mode
                /* there are three cost tiers for intra modes:
                *  pred[0]          - mode probable, least cost
                *  pred[1], pred[2] - less probable, slightly more cost
                *  non-mpm modes    - all cost the same (rbits) */
                uint64_t mpms;
                uint32_t mpmModes[3];
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);

                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
                uint64_t modeCosts[35];

                // DC prediction into m_intraPred
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;// distortion from the sa8d primitive (the variable is merely named sad)
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);

                // PLANAR
                pixel* planar = intraNeighbourBuf[0];
                if (tuSize >= 8 && tuSize <= 32)
                    planar = intraNeighbourBuf[1];// sizes 8..32 read the second neighbour buffer (presumably the filtered one - confirm)
                // PLANAR prediction into m_intraPred
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                // angular predictions
                if (primitives.cu[sizeIdx].intra_pred_allangs)
                {   // All angular modes (2..34) are predicted in one call into m_intraPredAngs
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);// transposed source, compared against modes below 18
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                    for (int mode = 2; mode < 35; mode++)
                    {
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        if (mode < 18)// modes 2..17 are measured against the transposed source, the rest against the source itself
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        else
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }
                else
                {
                    // No all-angles primitive: predict each angular mode separately,
                    // choosing the neighbour buffer via g_intraFilterFlags[mode] & scaleTuSize
                    for (int mode = 2; mode < 35; mode++)
                    {
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }

                /* Find the top maxCandCount candidate modes with cost within 25% of best
                * or among the most probable modes. maxCandCount is derived from the
                * rdLevel and depth. In general we want to try more modes at slower RD
                * levels and at higher depths */
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;
                // Threshold: best prediction-only cost plus a quarter of it
                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25x of the best cost
                for (int mode = 0; mode < 35; mode++)// a mode qualifies when it is under the threshold or equals mpmModes[0]
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);// offers the mode to the candidate lists (rdModeList / candCostList), which hold at most maxCandCount entries
            }

            /* measure best candidates using simple RDO (no TU splits) */
            bcost = MAX_INT64;// Stage 2: walk the candidate list
            for (int i = 0; i < maxCandCount; i++)
            {
                // Slots still holding MAX_INT64 were never filled
                if (candCostList[i] == MAX_INT64)
                    break;

                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                m_entropyCoder.load(m_rqt[depth].cur);// restart from this depth's entropy state, then apply the candidate direction to the PU
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost icosts;
                if (checkTransformSkip)// transform-skip aware path
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
                else// regular path with TU splitting disabled (false)
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);// keep the candidate with the lowest rdcost in bmode
            }
        }

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
        // Stage 3: apply the chosen direction to the PU
        /* remeasure best mode, allowing TU splits */
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
        m_entropyCoder.load(m_rqt[depth].cur);// restart from this depth's entropy state

        Cost icosts;
        if (checkTransformSkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
        else
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
        totalDistortion += icosts.distortion;
        // Extract the coded luma result for this PU into reconYuv
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
        // For every PU except the last one
        // set reconstruction for next intra prediction blocks
        if (puIdx != numPU - 1)
        {   // Copy this PU's reconstruction into the frame's recon picture
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
             * that the contexts should be tracked through each PU */
            PicYuv*  reconPic = m_frame->m_reconPic;
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            uint32_t dststride = reconPic->m_stride;
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
            uint32_t srcstride = reconYuv->m_size;
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
        }
    }

    if (numPU > 1)
    {   // OR the depth-1 luma CBF of the four quarters together and merge the
        // result into the CU's first luma CBF entry
        uint32_t combCbfY = 0;
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);

        cu.m_cbf[0][0] |= combCbfY;
    }

    // TODO: remove this
    m_entropyCoder.load(m_rqt[depth].cur);

    return totalDistortion;
}

6.帧内色度预测模块Search::estIntraPredChromaQT

遍历帧内色度预测模式候选列表（候选模式总共5个，分别为PLANAR=0、VER=26、HOR=10、DC=1以及与亮度相同的帧内模式；如果亮度模式与前面四种中的某一种相同，则将该重复的候选替换为34方向），得到最佳模式，并进一步测量和编码处理，代码分析如下：

// Chroma intra mode search. For every chroma TU section of the CU, try each
// candidate chroma prediction mode, keep the one with the lowest RD cost, and
// return the summed distortion of the chosen modes.
//   intraMode - mode under evaluation; its cu and reconYuv members are updated in place
//   cuGeom    - geometry of the CU (depth, log2CUSize, numPartitions and absPartIdx are read)
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{   // CU data and reconstruction buffer of the mode being evaluated
    CUData& cu = intraMode.cu;
    Yuv& reconYuv = intraMode.reconYuv;
    // initTuDepth is 1 only when the CU is not SIZE_2Nx2N and the chroma format is I444; otherwise 0
    uint32_t depth       = cuGeom.depth;
    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
    uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    uint32_t absPartStep = cuGeom.numPartitions;
    sse_t totalDistortion = 0;

    // index used below to select the chroma block-copy primitive
    int size = partitionFromLog2Size(log2TrSize);
    // iterate over the chroma sections of the CU: a single section when initTuDepth is 0, a quad split otherwise
    TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);

    do
    {
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        // best candidate found so far for this section
        uint32_t bestMode = 0;
        sse_t bestDist = 0;
        uint64_t bestCost = MAX_INT64;

        // init mode list
        uint32_t minMode = 0;
        uint32_t maxMode = NUM_CHROMA_MODE;
        uint32_t modeList[NUM_CHROMA_MODE];
        // If the CU already carries a chroma direction other than ALL_IDX and there is no initial TU split,
        // fill the whole list with that direction and test only that single candidate.
        if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
        {
            for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
            maxMode = 1;
        }
        else// otherwise let the CU fill modeList (presumably with the chroma directions permitted at this partition)
            cu.getAllowedChromaDir(absPartIdxC, modeList);
        // When the frame's m_fencPic is I400 but m_csp is not, collapse the list onto its first entry
        // and test only one candidate.
        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
        {
            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
                modeList[l] = modeList[0];
            maxMode = 1;
        }
        // check chroma modes
        for (uint32_t mode = minMode; mode < maxMode; mode++)
        {   // every candidate starts from the same entropy coder state saved for this depth
            // restore context models
            m_entropyCoder.load(m_rqt[depth].cur);
            // record the candidate as the chroma direction of this section
            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
            Cost outCost;// filled in by codeIntraChromaQt; its distortion and energy feed the cost below
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);

            // NOTE(review): presumably codeIntraChromaQt can leave the coder in a different state when
            // transform skip is enabled, hence the extra reload - confirm against codeIntraChromaQt
            if (m_slice->m_pps->bTransformSkipEnabled)
                m_entropyCoder.load(m_rqt[depth].cur);
            // start counting bits for this candidate from zero
            m_entropyCoder.resetBits();
            // chroma prediction mode: coded only at the first partition of the CU, or, for a
            // non-2Nx2N CU in I444, at the first partition of each quadrant
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
            {
                if (!absPartIdxC)
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
            }
            else
            {
                uint32_t qNumParts = cuGeom.numPartitions >> 2;
                if (!(absPartIdxC & (qNumParts - 1)))
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
            }

            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);// code the coefficients of U, then of V
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
            uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();// bits written since resetBits(); the cost uses psy-RD, else SSIM-RD, else plain RD
            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
                                             : m_rdCost.calcRdCost(outCost.distortion, bits);

            if (cost < bestCost)
            {   // new best candidate: remember its cost, distortion and mode, extract its result into
                // reconYuv, and stash its chroma cbf / transform-skip flags in the temporary buffers
                bestCost = cost;
                bestDist = outCost.distortion;
                bestMode = modeList[mode];
                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
            }
        }
        // for every section except the last one
        if (!tuIterator.isLastSection())
        {   // partition index of this section: the CU's absPartIdx plus the section offset inside the CU
            uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
            PicYuv*  reconPic  = m_frame->m_reconPic;
            uint32_t dststride = reconPic->m_strideC;
            const pixel* src;
            pixel* dst;
            // copy the best Cb and Cr reconstruction of this section from reconYuv into the frame's
            // reconstructed picture (presumably so the following sections can predict from it)
            dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
            src = reconYuv.getCbAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);

            dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
            src = reconYuv.getCrAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
        }

        // write the best candidate's cbf / transform-skip flags and mode back into the CU
        // (the CU arrays currently hold the values of the last candidate tried), then accumulate its distortion
        memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
        totalDistortion += bestDist;
    }
    while (tuIterator.isNextSection());

    if (initTuDepth != 0)
    {   // the CU was processed as four sections: combine their U and V cbf values;
        // qNumParts is the partition step between consecutive sections
        uint32_t combCbfU = 0;
        uint32_t combCbfV = 0;
        uint32_t qNumParts = tuIterator.absPartIdxStep;
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {   // OR together the values getCbf returns for each section (last argument 1, presumably the TU depth)
            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
        }
        // fold the combined values into the cbf entry of the CU's first partition
        cu.m_cbf[1][0] |= combCbfU;
        cu.m_cbf[2][0] |= combCbfV;
    }

    /* TODO: remove this */
    m_entropyCoder.load(m_rqt[depth].cur);
    return totalDistortion;
}

点赞、收藏，会是我继续写作的动力！赠人玫瑰，手有余香。