NVIDIA 硬解码学习 4
做项目的时候遇到一个问题: 起了多个解码器, 但每个解码器各自拥有其上下文, 因此各解码器解码得到的 GPU 数据无法放在一起共同操作.
可以参考下面这个程序解决这个问题.
AppDecMultiInput
- This sample application demonstrates how to decode multiple raw video files and post-process them with CUDA kernels on different CUDA streams.
程序功能介绍:
- 读入一个文件.
- 起了多个解码器(程序中为4个)
- 对这个文件各自进行解码,然后每个解码器在这个解码后数据上做了一个波纹的特效.
- 最后把解码的数据Merge到一起.写到一个输出文件
输入:
输出:
代码介绍
- 主函数
// Entry point: decodes one input file with n (=4) NVDEC decoders that all
// share a single CUDA context, lets each worker overlay a ripple effect on
// its frames, merges the four NV12 streams into one device image per frame
// index, and writes the merged frames to szOutFilePath.
int main(int argc, char *argv[])
{
    char szInFilePath[256] = "", szOutFilePath[256] = "out.nv12";
    int iGpu = 0;
    std::vector<std::exception_ptr> vExceptionPtrs; // one slot per worker thread
    try
    {
        // Read input/output paths and GPU ordinal from the command line.
        ParseCommandLine(argc, argv, szInFilePath, szOutFilePath, iGpu);
        CheckInputFile(szInFilePath);
        // Initialize the CUDA driver API and validate the GPU ordinal.
        ck(cuInit(0));
        int nGpu = 0;
        ck(cuDeviceGetCount(&nGpu));
        if (iGpu < 0 || iGpu >= nGpu)
        {
            std::ostringstream err;
            err << "GPU ordinal out of range. Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl;
            throw std::invalid_argument(err.str());
        }
        CUdevice cuDevice = 0;
        ck(cuDeviceGet(&cuDevice, iGpu));
        char szDeviceName[80];
        ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
        std::cout << "GPU in use: " << szDeviceName << std::endl;
        // Create ONE CUDA context; every decoder is constructed on it, so the
        // device frames they produce can be consumed together (the whole
        // point of this sample).
        CUcontext cuContext = NULL;
        ck(cuCtxCreate(&cuContext, 0, cuDevice));
        // Demux here only to obtain stream properties (size, codec) for
        // sizing buffers and constructing decoders; each worker thread opens
        // its own demuxer on the same file.
        FFmpegDemuxer demuxer(szInFilePath);
        int nWidth = demuxer.GetWidth(), nHeight = demuxer.GetHeight(), nByte = nWidth * nHeight * 3 / 2; // NV12 bytes per frame
        // Number of decoders
        const int n = 4;
        // Every decoder has its own ring queue of locked device-frame pointers.
        uint8_t *aapFrameBuffer[n][8];
        // Queue capacity (8 slots per decoder)
        const int nFrameBuffer = sizeof(aapFrameBuffer[0]) / sizeof(aapFrameBuffer[0][0]);
        // Shared read limit: worker j may produce while aiHead[j] < iEnd.
        int iEnd = nFrameBuffer;
        bool abStop[n] = {};  // set by worker j when its stream is exhausted
        int aiHead[n] = {};   // per-worker write position (frames produced so far)
        std::vector <NvThread> vThreads;
        std::vector <std::unique_ptr<NvDecoder>> vDecoders;
        // Ripple centers: one quadrant center per decoder (x then y).
        int axCenter[] = { nWidth / 4, nWidth / 4 * 3, nWidth / 4, nWidth / 4 * 3 };
        int ayCenter[] = { nHeight / 4, nHeight / 4, nHeight / 4 * 3, nHeight / 4 * 3 };
        cudaStream_t aStream[n];
        vExceptionPtrs.resize(n);
        for (int i = 0; i < n; i++)
        {
            ck(cudaStreamCreate(&aStream[i])); // one CUDA stream per decoder
            // All decoders share cuContext created above.
            std::unique_ptr<NvDecoder> dec(new NvDecoder(cuContext, demuxer.GetWidth(), demuxer.GetHeight(), true, FFmpeg2NvCodecId(demuxer.GetVideoCodec())));
            vDecoders.push_back(std::move(dec));
            // Launch the worker thread that decodes and applies the ripple.
            vThreads.push_back(NvThread(std::thread(DecProc, vDecoders[i].get(), szInFilePath, nWidth, nHeight, aapFrameBuffer[i],
                nFrameBuffer, &iEnd, aiHead + i, abStop + i, aStream[i], axCenter[i], ayCenter[i], std::ref(vExceptionPtrs[i]))));
        }
        std::unique_ptr<uint8_t[]> pImage(new uint8_t[nByte]); // host staging buffer for file output
        uint8_t* dpImage = nullptr; // device buffer receiving the merged frame
        ck(cudaMalloc(&dpImage, nByte));
        std::ofstream fpOut(szOutFilePath, std::ios::out | std::ios::binary);
        if (!fpOut)
        {
            std::ostringstream err;
            err << "Unable to open output file: " << szOutFilePath << std::endl;
            throw std::invalid_argument(err.str());
        }
        int nFrame = 0;
        // Consumer loop: for each frame index i, wait until every decoder has
        // produced it, merge the four copies, write the result, then release
        // the ring slots back to the workers.
        for (int i = 0;; i++)
        {
            // iHead becomes the minimum head over all decoders, used below to
            // detect that some decoder stopped before producing frame #i.
            int iHead = INT_MAX;
            for (int j = 0; j < n; j++)
            {
                while (!abStop[j] && aiHead[j] <= i)
                {
                    // Decoder #j hasn't produced frame #i yet; busy-wait.
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                }
                iHead = (std::min)(iHead, aiHead[j]);
            }
            if (iHead <= i)
            {
                // Some decoder stopped: no complete set of frames for #i.
                nFrame = i;
                break;
            }
            std::cout << "Merge frames at #" << i << "\r"; // progress for frame #i
            // Gather frame #i from each decoder's ring slot.
            uint8_t *apNv12[] = { aapFrameBuffer[0][i % nFrameBuffer], aapFrameBuffer[1][i % nFrameBuffer], aapFrameBuffer[2][i % nFrameBuffer], aapFrameBuffer[3][i % nFrameBuffer] };
            // Merge all four frames into dpImage (launched on the default stream).
            LaunchMerge(0, dpImage, apNv12, n, nWidth, nHeight);
            // NOTE(review): blocking cudaMemcpy on the default stream also
            // orders after the merge launch above — presumably relied upon
            // here; confirm LaunchMerge targets the default stream.
            ck(cudaMemcpy(pImage.get(), dpImage, nByte, cudaMemcpyDeviceToHost));
            fpOut.write(reinterpret_cast<char*>(pImage.get()), nByte);
            // Return the consumed slot of every decoder.
            for (int j = 0; j < n; j++)
            {
                vDecoders[j]->UnlockFrame(&aapFrameBuffer[j][i % nFrameBuffer], 1);
            }
            // Advance the shared limit: one more slot is free in every queue.
            iEnd++;
        }
        fpOut.close();
        ck(cudaFree(dpImage));
        // Rethrow the first worker exception, if any.
        for (int i = 0; i < n; i++)
        {
            if (vExceptionPtrs[i])
            {
                std::rethrow_exception(vExceptionPtrs[i]);
            }
        }
        ck(cudaProfilerStop());
        // NOTE(review): aStream[] and cuContext are never destroyed; the
        // sample relies on process teardown to release them.
        if (nFrame)
        {
            std::cout << "Merged video saved in " << szOutFilePath << ". A total of " << nFrame << " frames were decoded." << std::endl;
            return 0;
        }
        else
        {
            std::cout << "Warning: no video frame decoded. Please don't use container formats (such as mp4/avi/webm) as the input, but use raw elementary stream file instead." << std::endl;
            return 1;
        }
    }
    catch (const std::exception &ex)
    {
        std::cout << ex.what();
        exit(1);
    }
    return 0;
}
- 创建解码器
// Constructs a decoder bound to an existing CUDA context. The constructor
// only creates the CUVID context lock and the video parser; the actual
// decoder object is created later, from the sequence callback, once the
// parser has seen the stream parameters.
NvDecoder::NvDecoder(CUcontext cuContext, int nWidth, int nHeight, bool bUseDeviceFrame, cudaVideoCodec eCodec, std::mutex *pMutex,
    bool bLowLatency, bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim, int maxWidth, int maxHeight) :
    m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec), m_pMutex(pMutex),
    m_bDeviceFramePitched(bDeviceFramePitched), m_nMaxWidth(maxWidth), m_nMaxHeight(maxHeight)
{
    // Optional crop/resize settings; defaults are kept when not supplied.
    if (pCropRect)
        m_cropRect = *pCropRect;
    if (pResizeDim)
        m_resizeDim = *pResizeDim;

    // Context lock shared by CUVID calls made on this context.
    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));

    // Parser configuration. The three callbacks drive the whole decode
    // pipeline (sequence -> decode -> display) and receive `this` as the
    // user-data pointer.
    CUVIDPARSERPARAMS parserParams = {};
    parserParams.CodecType = eCodec;
    parserParams.ulMaxNumDecodeSurfaces = 1;
    // No display delay in low-latency mode; otherwise allow one frame.
    parserParams.ulMaxDisplayDelay = bLowLatency ? 0 : 1;
    parserParams.pUserData = this;
    parserParams.pfnSequenceCallback = HandleVideoSequenceProc;
    parserParams.pfnDecodePicture = HandlePictureDecodeProc;
    parserParams.pfnDisplayPicture = HandlePictureDisplayProc;

    // An optional external mutex serializes parser creation across decoders.
    if (m_pMutex)
        m_pMutex->lock();
    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &parserParams));
    if (m_pMutex)
        m_pMutex->unlock();
}
- 具体每个解码器的解码线程
// Per-decoder worker thread: demuxes szInFilePath, decodes it with pDec on
// the CUDA context shared with main(), overlays an animated ripple (centered
// at xCenter/yCenter) onto each decoded NV12 frame, and publishes the locked
// frame pointers into the ring buffer apFrameBuffer for the consumer to merge.
//
// Ring-buffer protocol: *piHead is this thread's write position, *piEnd is the
// shared limit advanced by the consumer; the thread spins while the queue is
// full (*piHead == *piEnd). *pbStop is set once the stream is exhausted. Any
// exception is captured into `ex` for the main thread to rethrow.
void DecProc(NvDecoder *pDec, const char *szInFilePath, int nWidth, int nHeight, uint8_t **apFrameBuffer,
    int nFrameBuffer, int *piEnd, int *piHead, bool *pbStop, cudaStream_t stream,
    int xCenter, int yCenter, std::exception_ptr &ex)
{
    try
    {
        printf("--> enter DecProc\n");
        FFmpegDemuxer demuxer(szInFilePath); // each worker demuxes the file independently
        ck(cuCtxSetCurrent(pDec->GetContext())); // adopt the shared context created in main()
        uint8_t *dpRippleImage;
        // One byte per pixel: a luma-plane-sized ripple image.
        ck(cudaMalloc(&dpRippleImage, nWidth * nHeight));
        int iTime = 0;
        // BUGFIX: render the initial ripple before the first overlay. This
        // call was commented out, so the first LaunchOverlayRipple below read
        // dpRippleImage while it was still uninitialized device memory.
        LaunchRipple(stream, dpRippleImage, nWidth, nHeight, xCenter, yCenter, iTime++);
        int nVideoBytes = 0 /* bytes demuxed */, nFrameReturned = 0 /* frames decoded */;
        uint8_t *pVideo = NULL, **ppFrame;
        do
        {
            demuxer.Demux(&pVideo, &nVideoBytes);
            // Decode and keep the returned frames locked (no copy needed).
            pDec->DecodeLockFrame(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
            for (int i = 0; i < nFrameReturned; i++)
            {
                while (*piHead == *piEnd)
                {
                    // Queue full: wait for the consumer to free a slot.
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                }
                // Frame is locked, so publishing the pointer is sufficient.
                apFrameBuffer[*piHead % nFrameBuffer] = ppFrame[i];
                // Overlay the current ripple image onto the decoded frame.
                LaunchOverlayRipple(stream, apFrameBuffer[*piHead % nFrameBuffer], dpRippleImage, nWidth, nHeight);
                // Finish the kernel before publishing the slot as ready.
                ck(cudaStreamSynchronize(stream));
                // Mark the slot as ready for the consumer.
                ++*piHead;
                // Pre-render the next ripple phase for the following frame.
                LaunchRipple(stream, dpRippleImage, nWidth, nHeight, xCenter, yCenter, iTime++);
            }
        } while (nVideoBytes);
        ck(cudaFree(dpRippleImage)); // all frames decoded; release the ripple image
        *pbStop = true;
    }
    catch (std::exception&)
    {
        ex = std::current_exception();
    }
}