一、NPP介绍
NPP分为两个部分,分别为Image和Signal,详细参考What is NPP?
官网的文档中缺乏代码示例,这里结合网上能搜到的demo和自己的项目,以Image-Processing模块(也就是nppi模块)为例进行说明。默认大家已经阅读过官方的简单介绍(如果没有阅读,也可以到网上搜搜翻译),下面直接上代码。
二、示例
我首先介绍几个示例,分别为copy,set,resize,multiply,其中multiply有两种情况分别为InPlace和非InPlace的情况。
这些操作涉及几个重要的概念:
1、NppiSize和pSrc
Sub-Region(ROI操作)在图片处理中很常见,很多的操作都是在局部区域进行操作,而非在全局,关于这一点,nppi提供了丰富的接口,其中后缀为R表示支持ROI操作。
不过实现这个功能时传入的参数是pSrc和oSizeROI,而不是像我们在OpenCV中使用cv::Rect那样给出x、y、width和height,这里需要通过pSrc定位到ROI的TopLeft点,官方给出的公式如下:
pSrcOffset = pSrc + y * nSrcStep + x * PixelSize
其中nSrcStep为待操作图片的实际LineStep大小(即每行所占的字节数),而不是nppiMalloc_<>等2D内存分配api在分配过程中返回的linestep,nSrcStep可以由下列公式获取
nSrcStep = nSrcWidth * NumberOfColorChannels * sizeof(PixelDataType);
而PixelSize由如下公式获取:
PixelSize = NumberOfColorChannels * sizeof(PixelDataType)。
2、NppiRect
并不是所有的ROI操作都需要指定pSrc(自己计算偏移量)和oSizeROI,有些nppi的api,如nppiResize,可以直接提供NppiRect来进行操作。如果你仔细阅读官网的文档,你会发现还需要提供oSrcSize和oDstSize,这两个参数分别表示原图(source)和结果图(destination)的全图大小。
3、LineStep
linestep这个概念贯穿使用npp的整个过程。因为内存对齐的原因,nppiMalloc系列api分配的内存每行linestep的大小可能大于图片每行实际的字节数,因此需要查看并与图片实际的大小进行比较。本文示例用cudaMemcpy把图片数据连续(紧密排列)地拷入显存,所以传入各api的是图片实际的linestep;如果希望使用nppiMalloc返回的linestep,则需要改用cudaMemcpy2D按行拷贝。
在OpenCV中,获取cv::Mat的step大小,只需要直接访问mat.step就可以,这在示例中有体现。
int main()
{
cv::Mat matSrc = cv::imread("./test.jpg");
int nSrcWidth = matSrc.cols;
int nSrcHeight = matSrc.rows;
int nSrcLineStep = 0;
Npp8u *pbySrc = nppiMalloc_8u_C3(nSrcWidth, nSrcHeight, &nSrcLineStep);
printf("nSrcLineStep = %d \n", nSrcLineStep);
cudaMemcpy(pbySrc, matSrc.data, nSrcHeight * nSrcWidth * 3 *sizeof(unsigned char), cudaMemcpyHostToDevice);
// test nppi copy
int nCopyWidth = 400;
int nCopyHeight = 400;
int nCopyLineStep = 0;
Npp8u *pbyCopy = nppiMalloc_8u_C3(nCopyWidth, nSrcHeight, &nCopyLineStep);
printf("nCopyLineStep = %d \n", nCopyLineStep);
NppiSize oCopySrcSizeROI = {400, 400};
Npp8u *pbySrcTopLeft = pbySrc + 200 * nSrcWidth * 3 + 200 * 3 * sizeof(Npp8u);
nppiCopy_8u_C3R(pbySrcTopLeft, nSrcWidth * 3, pbyCopy, 1200, oCopySrcSizeROI);
cv::Mat matCopy;
matCopy.create(nCopyHeight, nCopyWidth, CV_8UC3);
cudaMemcpy(matCopy.data, pbyCopy, nCopyHeight * nCopyWidth * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cv::imwrite("./test_copy.jpg", matCopy);
std::cout << "matCopy Width, Height and Step: " << matCopy.cols << ", " << matCopy.rows << ", " << matCopy.step << std::endl;
nppiFree(pbyCopy);
// test nppi set
int nSetWidth = 400;
int nSetHeight = 400;
int nSetLineStep = 0;
Npp8u *pbySet = nppiMalloc_8u_C3(nSetWidth, nSetHeight, &nSetLineStep);
printf("nSetLineStep = %d \n", nSetLineStep);
Npp8u anSetDstValue[3] = {110, 110, 110};
NppiSize oSetDstSizeROI = {400, 400};
Npp8u anSetDstValue2[3] = {255, 255, 255};
NppiSize oSetDstSizeROI2 = {100, 100};
nppiSet_8u_C3R(anSetDstValue, pbySet, 1200, oSetDstSizeROI);
nppiSet_8u_C3R(anSetDstValue2, pbySet, 1200, oSetDstSizeROI2);
cv::Mat matSet;
matSet.create(nSetHeight, nSetWidth, CV_8UC3);
cudaMemcpy(matSet.data, pbySet, nSetHeight * nSetWidth * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cv::imwrite("./test_set.jpg", matSet);
std::cout << "matSet Width, Height and Step: " << matSet.cols << ", " << matSet.rows <<", " << matSet.step << std::endl;
nppiFree(pbySet);
// test nppi resize
int nResizeWidth = 600;
int nResizeHeight = 600;
int nResizeLineStep = 0;
Npp8u *pbyResize = nppiMalloc_8u_C3(nResizeWidth, nResizeHeight, &nResizeLineStep);
printf("nResizeLineStep = %d \n", nResizeLineStep);
int nResizeSrcLineStep = nSrcWidth * 3 * sizeof(unsigned char);
int nResizeDstLineStep = nResizeWidth * 3 * sizeof(unsigned char);
NppiSize oResizeSrcSize = {nSrcWidth, nSrcHeight};
NppiSize oResizeDstSize = {nResizeWidth, nResizeHeight};
NppiRect oResizeSrcRect = {200, 200, 500, 500};
NppiRect oResizeDstRect = {0, 0, nResizeWidth, nResizeHeight};
nppiResize_8u_C3R(pbySrc, nResizeSrcLineStep, oResizeSrcSize, oResizeSrcRect,
pbyResize, nResizeDstLineStep, oResizeDstSize, oResizeDstRect, 2);
cv::Mat matResize;
matResize.create(nResizeHeight, nResizeWidth, CV_8UC3);
cudaMemcpy(matResize.data, pbyResize, nResizeHeight * nResizeWidth * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cv::imwrite("./test_resize.jpg", matResize);
std::cout << "matResize Width, Heigth and Step: " << matResize.cols << ", " << matResize.rows << ", " << matResize.step << std::endl;
nppiFree(pbyResize);
// test nppi multiply(and in-place, Sfs)
int nMulWidth = 400;
int nMulHeight = 400;
int nMulLineStep = 0;
Npp8u *pbyMul = nppiMalloc_8u_C3(nMulWidth, nMulHeight, &nMulLineStep);
Npp8u *pbyMulResult = nppiMalloc_8u_C3(nMulWidth, nMulHeight, &nMulLineStep);
printf("nSetLineStep =%d \n", nMulLineStep);
Npp8u anMulSetDstValue[3] = {0, 0, 0};
NppiSize oMulSetDstSizeROI = {400, 400};
Npp8u anMulSetDstValue2[3] = {1, 1, 1};
NppiSize oMulSetDstSizeROI2 = {100, 100};
int nMulSetSrcLineStep = nSrcWidth * 3 * sizeof(unsigned char);
int nMulSetDstLineStep = nMulWidth * 3 * sizeof(unsigned char);
nppiSet_8u_C3R(anMulSetDstValue, pbyMul, nMulSetDstLineStep, oMulSetDstSizeROI);
nppiSet_8u_C3R(anMulSetDstValue2, pbyMul, nMulSetDstLineStep, oMulSetDstSizeROI2);
NppiSize oMulDstSizeROI = {nMulWidth, nMulHeight};
nppiMul_8u_C3RSfs(pbySrc, nMulSetSrcLineStep, pbyMul, nMulSetDstLineStep, pbyMulResult, nMulSetDstLineStep, oMulDstSizeROI, 0);
nppiMul_8u_C3IRSfs(pbySrc, nMulSetSrcLineStep, pbyMul, nMulSetDstLineStep, oMulDstSizeROI, 0);
cv::Mat matMul; // save matMul
matMul.create(nMulHeight, nMulWidth, CV_8UC3);
cudaMemcpy(matMul.data, pbyMulResult, nMulWidth * nMulHeight * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cv::imwrite("./test_mul.jpg", matMul);
std::cout << "matMul Width, Height and Step: " << matMul.cols << ", " << matMul.rows << ", " << matMul.step << std::endl;
cv::Mat matMulInPlace; // save matMulInPlace
matMulInPlace.create(nMulHeight, nMulWidth, CV_8UC3);
cudaMemcpy(matMulInPlace.data, pbyMul, nMulWidth * nMulHeight * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cv::imwrite("./test_mulInPlace.jpg", matMulInPlace);
std::cout << "matMulInPlace Width, Height and Step: " << matMulInPlace.cols << ", " << matMulInPlace.rows << ", " << matMulInPlace.step << std::endl;
nppiFree(pbyMul);
nppiFree(pbyMulResult);
// free cuda memory
nppiFree(pbySrc);
printf("end \n");
return 0;
}