NVIDIA 硬解码学习 4
做项目的时候遇到一个问题: 起了多个解码器, 但每个解码器各自拥有其上下文, 因此各解码器解码得到的 GPU 数据无法放在一起共同操作.
可以参考下面这个程序解决这个问题.
AppDecMultiInput
- This sample application demonstrates how to decode multiple raw video files and post-process them with CUDA kernels on different CUDA streams.
程序功能介绍:
- 读入一个文件.
- 起了多个解码器(程序中为4个)
- 对这个文件各自进行解码,然后每个解码器在这个解码后数据上做了一个波纹的特效.
- 最后把解码的数据Merge到一起.写到一个输出文件
输入:
输出:
代码介绍
- 主函数
// Entry point: decodes one input file with n (=4) NVDEC decoders that all
// share a single CUDA context, lets each worker overlay a ripple effect on
// its frames, merges the four NV12 streams into one device image per frame
// index, and writes the merged frames to szOutFilePath.
int main(int argc, char *argv[])
{
    char szInFilePath[256] = "", szOutFilePath[256] = "out.nv12";
    int iGpu = 0;
    std::vector<std::exception_ptr> vExceptionPtrs; // one slot per worker thread
    try
    {
        // Read input/output paths and GPU ordinal from the command line.
        ParseCommandLine(argc, argv, szInFilePath, szOutFilePath, iGpu);
        CheckInputFile(szInFilePath);
        // Initialize the CUDA driver API and validate the GPU ordinal.
        ck(cuInit(0));
        int nGpu = 0;
        ck(cuDeviceGetCount(&nGpu));
        if (iGpu < 0 || iGpu >= nGpu)
        {
            std::ostringstream err;
            err << "GPU ordinal out of range. Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl;
            throw std::invalid_argument(err.str());
        }
        CUdevice cuDevice = 0;
        ck(cuDeviceGet(&cuDevice, iGpu));
        char szDeviceName[80];
        ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
        std::cout << "GPU in use: " << szDeviceName << std::endl;
        // Create ONE CUDA context; every decoder is constructed on it, so the
        // device frames they produce can be consumed together (the whole
        // point of this sample).
        CUcontext cuContext = NULL;
        ck(cuCtxCreate(&cuContext, 0, cuDevice));
        // Demux here only to obtain stream properties (size, codec) for
        // sizing buffers and constructing decoders; each worker thread opens
        // its own demuxer on the same file.
        FFmpegDemuxer demuxer(szInFilePath);
        int nWidth = demuxer.GetWidth(), nHeight = demuxer.GetHeight(), nByte = nWidth * nHeight * 3 / 2; // NV12 bytes per frame
        // Number of decoders
        const int n = 4;
        // Every decoder has its own ring queue of locked device-frame pointers.
        uint8_t *aapFrameBuffer[n][8];
        // Queue capacity (8 slots per decoder)
        const int nFrameBuffer = sizeof(aapFrameBuffer[0]) / sizeof(aapFrameBuffer[0][0]);
        // Shared read limit: worker j may produce while aiHead[j] < iEnd.
        int iEnd = nFrameBuffer;
        bool abStop[n] = {};  // set by worker j when its stream is exhausted
        int aiHead[n] = {};   // per-worker write position (frames produced so far)
        std::vector <NvThread> vThreads;
        std::vector <std::unique_ptr<NvDecoder>> vDecoders;
        // Ripple centers: one quadrant center per decoder (x then y).
        int axCenter[] = { nWidth / 4, nWidth / 4 * 3, nWidth / 4, nWidth / 4 * 3 };
        int ayCenter[] = { nHeight / 4, nHeight / 4, nHeight / 4 * 3, nHeight / 4 * 3 };
        cudaStream_t aStream[n];
        vExceptionPtrs.resize(n);
        for (int i = 0; i < n; i++)
        {
            ck(cudaStreamCreate(&aStream[i])); // one CUDA stream per decoder
            // All decoders share cuContext created above.
            std::unique_ptr<NvDecoder> dec(new NvDecoder(cuContext, demuxer.GetWidth(), demuxer.GetHeight(), true, FFmpeg2NvCodecId(demuxer.GetVideoCodec())));
            vDecoders.push_back(std::move(dec));
            // Launch the worker thread that decodes and applies the ripple.
            vThreads.push_back(NvThread(std::thread(DecProc, vDecoders[i].get(), szInFilePath, nWidth, nHeight, aapFrameBuffer[i],
                nFrameBuffer, &iEnd, aiHead + i, abStop + i, aStream[i], axCenter[i], ayCenter[i], std::ref(vExceptionPtrs[i]))));
        }
        std::unique_ptr<uint8_t[]> pImage(new uint8_t[nByte]); // host staging buffer for file output
        uint8_t* dpImage = nullptr; // device buffer receiving the merged frame
        ck(cudaMalloc(&dpImage, nByte));
        std::ofstream fpOut(szOutFilePath, std::ios::out | std::ios::binary);
        if (!fpOut)
        {
            std::ostringstream err;
            err << "Unable to open output file: " << szOutFilePath << std::endl;
            throw std::invalid_argument(err.str());
        }
        int nFrame = 0;
        // Consumer loop: for each frame index i, wait until every decoder has
        // produced it, merge the four copies, write the result, then release
        // the ring slots back to the workers.
        for (int i = 0;; i++)
        {
            // iHead becomes the minimum head over all decoders, used below to
            // detect that some decoder stopped before producing frame #i.
            int iHead = INT_MAX;
            for (int j = 0; j < n; j++)
            {
                while (!abStop[j] && aiHead[j] <= i)
                {
                    // Decoder #j hasn't produced frame #i yet; busy-wait.
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                }
                iHead = (std::min)(iHead, aiHead[j]);
            }
            if (iHead <= i)
            {
                // Some decoder stopped: no complete set of frames for #i.
                nFrame = i;
                break;
            }
            std::cout << "Merge frames at #" << i << "\r"; // progress for frame #i
            // Gather frame #i from each decoder's ring slot.
            uint8_t *apNv12[] = { aapFrameBuffer[0][i % nFrameBuffer], aapFrameBuffer[1][i % nFrameBuffer], aapFrameBuffer[2][i % nFrameBuffer], aapFrameBuffer[3][i % nFrameBuffer] };
            // Merge all four frames into dpImage (launched on the default stream).
            LaunchMerge(0, dpImage, apNv12, n, nWidth, nHeight);
            // NOTE(review): blocking cudaMemcpy on the default stream also
            // orders after the merge launch above — presumably relied upon
            // here; confirm LaunchMerge targets the default stream.
            ck(cudaMemcpy(pImage.get(), dpImage, nByte, cudaMemcpyDeviceToHost));
            fpOut.write(reinterpret_cast<char*>(pImage.get()), nByte);
            // Return the consumed slot of every decoder.
            for (int j = 0; j < n; j++)
            {
                vDecoders[j]->UnlockFrame(&aapFrameBuffer[j][i % nFrameBuffer], 1);
            }
            // Advance the shared limit: one more slot is free in every queue.
            iEnd++;
        }
        fpOut.close();
        ck(cudaFree(dpImage));
        // Rethrow the first worker exception, if any.
        for (int i = 0; i < n; i++)
        {
            if (vExceptionPtrs[i])
            {
                std::rethrow_exception(vExceptionPtrs[i]);
            }
        }
        ck(cudaProfilerStop());
        // NOTE(review): aStream[] and cuContext are never destroyed; the
        // sample relies on process teardown to release them.
        if (nFrame)
        {
            std::cout << "Merged video saved in " << szOutFilePath << ". A total of " << nFrame << " frames were decoded." << std::endl;
            return 0;
        }
        else
        {
            std::cout << "Warning: no video frame decoded. Please don't use container formats (such as mp4/avi/webm) as the input, but use raw elementary stream file instead." << std::endl;
            return 1;
        }
    }
    catch (const std::exception &ex)
    {
        std::cout << ex.what();
        exit(1);
    }
    return 0;
}
- 创建解码器
// Constructs a decoder bound to an existing CUDA context. The constructor
// only creates the CUVID context lock and the video parser; the actual
// decoder object is created later, from the sequence callback, once the
// parser has seen the stream parameters.
NvDecoder::NvDecoder(CUcontext cuContext, int nWidth, int nHeight, bool bUseDeviceFrame, cudaVideoCodec eCodec, std::mutex *pMutex,
    bool bLowLatency, bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim, int maxWidth, int maxHeight) :
    m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec), m_pMutex(pMutex),
    m_bDeviceFramePitched(bDeviceFramePitched), m_nMaxWidth(maxWidth), m_nMaxHeight(maxHeight)
{
    // Optional crop/resize settings; defaults are kept when not supplied.
    if (pCropRect)
        m_cropRect = *pCropRect;
    if (pResizeDim)
        m_resizeDim = *pResizeDim;

    // Context lock shared by CUVID calls made on this context.
    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));

    // Parser configuration. The three callbacks drive the whole decode
    // pipeline (sequence -> decode -> display) and receive `this` as the
    // user-data pointer.
    CUVIDPARSERPARAMS parserParams = {};
    parserParams.CodecType = eCodec;
    parserParams.ulMaxNumDecodeSurfaces = 1;
    // No display delay in low-latency mode; otherwise allow one frame.
    parserParams.ulMaxDisplayDelay = bLowLatency ? 0 : 1;
    parserParams.pUserData = this;
    parserParams.pfnSequenceCallback = HandleVideoSequenceProc;
    parserParams.pfnDecodePicture = HandlePictureDecodeProc;
    parserParams.pfnDisplayPicture = HandlePictureDisplayProc;

    // An optional external mutex serializes parser creation across decoders.
    if (m_pMutex)
        m_pMutex->lock();
    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &parserParams));
    if (m_pMutex)
        m_pMutex->unlock();
}
- 具体每个解码器的解码线程
// Per-decoder worker thread: demuxes szInFilePath, decodes it with pDec on
// the CUDA context shared with main(), overlays an animated ripple (centered
// at xCenter/yCenter) onto each decoded NV12 frame, and publishes the locked
// frame pointers into the ring buffer apFrameBuffer for the consumer to merge.
//
// Ring-buffer protocol: *piHead is this thread's write position, *piEnd is the
// shared limit advanced by the consumer; the thread spins while the queue is
// full (*piHead == *piEnd). *pbStop is set once the stream is exhausted. Any
// exception is captured into `ex` for the main thread to rethrow.
void DecProc(NvDecoder *pDec, const char *szInFilePath, int nWidth, int nHeight, uint8_t **apFrameBuffer,
    int nFrameBuffer, int *piEnd, int *piHead, bool *pbStop, cudaStream_t stream,
    int xCenter, int yCenter, std::exception_ptr &ex)
{
    try
    {
        printf("--> enter DecProc\n");
        FFmpegDemuxer demuxer(szInFilePath); // each worker demuxes the file independently
        ck(cuCtxSetCurrent(pDec->GetContext())); // adopt the shared context created in main()
        uint8_t *dpRippleImage;
        // One byte per pixel: a luma-plane-sized ripple image.
        ck(cudaMalloc(&dpRippleImage, nWidth * nHeight));
        int iTime = 0;
        // BUGFIX: render the initial ripple before the first overlay. This
        // call was commented out, so the first LaunchOverlayRipple below read
        // dpRippleImage while it was still uninitialized device memory.
        LaunchRipple(stream, dpRippleImage, nWidth, nHeight, xCenter, yCenter, iTime++);
        int nVideoBytes = 0 /* bytes demuxed */, nFrameReturned = 0 /* frames decoded */;
        uint8_t *pVideo = NULL, **ppFrame;
        do
        {
            demuxer.Demux(&pVideo, &nVideoBytes);
            // Decode and keep the returned frames locked (no copy needed).
            pDec->DecodeLockFrame(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
            for (int i = 0; i < nFrameReturned; i++)
            {
                while (*piHead == *piEnd)
                {
                    // Queue full: wait for the consumer to free a slot.
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                }
                // Frame is locked, so publishing the pointer is sufficient.
                apFrameBuffer[*piHead % nFrameBuffer] = ppFrame[i];
                // Overlay the current ripple image onto the decoded frame.
                LaunchOverlayRipple(stream, apFrameBuffer[*piHead % nFrameBuffer], dpRippleImage, nWidth, nHeight);
                // Finish the kernel before publishing the slot as ready.
                ck(cudaStreamSynchronize(stream));
                // Mark the slot as ready for the consumer.
                ++*piHead;
                // Pre-render the next ripple phase for the following frame.
                LaunchRipple(stream, dpRippleImage, nWidth, nHeight, xCenter, yCenter, iTime++);
            }
        } while (nVideoBytes);
        ck(cudaFree(dpRippleImage)); // all frames decoded; release the ripple image
        *pbStop = true;
    }
    catch (std::exception&)
    {
        ex = std::current_exception();
    }
}