1 最近在学习CUDA优化,看到纹理内存部分,资料中说可以使用纹理内存来加速。
可是使用纹理内存后,图像的处理速度反而变慢了,不知道原因是什么?
2 使用纹理内存的代码
// Sobel edge extraction, texture-memory version
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include"opencv2\opencv.hpp"
using namespace std;
using namespace cv;
// Declare the file-scope texture reference that the Sobel kernel samples with tex2D
texture<uchar,2,cudaReadModeElementType> tex;// 2D texture of uchar texels, fetched as raw element values
// Device kernel: 3x3 Sobel edge detection that reads its input through the
// file-scope 2D texture reference `tex`.
//
// Each thread handles the pixel (ix, iy) derived from its 2D block/thread index;
// threads that fall outside nWidth x nHeight return without touching memory.
//   pImgOut    - output buffer, written at pImgOut[iy * nWidthStep + ix]
//   nWidth     - number of pixels processed per row
//   nHeight    - number of rows processed
//   nWidthStep - distance between consecutive output rows, in elements
// The stored value is |Gx| + |Gy| saturated to 255.
// Neighbour fetches at the image border use coordinates outside the image; how
// those are resolved depends on the addressing mode of `tex`, which is set up by
// the host code, not here.
__global__ void ImgEdge_3x3_S_kernel(uchar* pImgOut,
int nWidth,
int nHeight,
int nWidthStep)
{
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: the grid may be larger than the image.
    if (ix >= nWidth || iy >= nHeight)
        return;

    // Texture coordinates of the left/centre/right columns and upper/centre/lower rows.
    const float xLeft = (float)ix - 1;
    const float xMid = (float)ix;
    const float xRight = (float)ix + 1;
    const float yUp = (float)iy - 1;
    const float yMid = (float)iy;
    const float yDown = (float)iy + 1;

    // The eight neighbours, named by compass direction. The centre texel is not
    // needed because both Sobel masks weight it with zero.
    const int nw = tex2D(tex, xLeft, yUp);
    const int n = tex2D(tex, xMid, yUp);
    const int ne = tex2D(tex, xRight, yUp);
    const int w = tex2D(tex, xLeft, yMid);
    const int e = tex2D(tex, xRight, yMid);
    const int sw = tex2D(tex, xLeft, yDown);
    const int s = tex2D(tex, xMid, yDown);
    const int se = tex2D(tex, xRight, yDown);

    // Horizontal gradient: right column minus left column, centre row weighted by 2.
    const int gradX = (ne - nw) + 2 * (e - w) + (se - sw);
    // Vertical gradient: upper row minus lower row, centre column weighted by 2.
    const int gradY = (nw - sw) + 2 * (n - s) + (ne - se);

    const int magnitude = abs(gradX) + abs(gradY);

    // Saturate to the 8-bit range before storing.
    pImgOut[ix + iy * nWidthStep] = (uchar)(magnitude > 255 ? 255 : magnitude);
}
// Host-side wrapper: runs the texture-based 3x3 Sobel kernel on an 8-bit image.
//
//   pImgOut    - host buffer that receives the result (nWidthStep * nHeight bytes)
//   pImgIn     - host source image whose rows are nWidthStep bytes apart
//   nWidth     - pixels per row
//   nHeight    - number of rows
//   nWidthStep - row pitch in bytes of both host buffers (must be >= nWidth)
//   nChannels  - only used to size the launch grid; the texture itself is created
//                as a single-channel uchar array of nWidth x nHeight texels
//
// Returns 0 on success. Returns -1 (after printing a message to stderr) when the
// arguments are invalid or any CUDA call fails.
double cudaImgEdge(uchar* pImgOut,
uchar* pImgIn,
int nWidth,
int nHeight,
int nWidthStep,
int nChannels)
{
    if (pImgOut == NULL || pImgIn == NULL ||
        nWidth <= 0 || nHeight <= 0 || nWidthStep < nWidth)
    {
        fprintf(stderr, "cudaImgEdge: invalid arguments\n");
        return -1;
    }

    // Widen before multiplying so large images cannot overflow int.
    const size_t nBytes = (size_t)nWidthStep * (size_t)nHeight * sizeof(uchar);

    uchar* d_pImgOutGPU = NULL;   // device output buffer (pitch = nWidthStep)
    cudaArray* cuArray = NULL;    // device array backing the texture
    bool bTexBound = false;       // whether `tex` must be unbound during cleanup
    // Describes the array element: one 8-bit unsigned component.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();

    // Each step runs only if every earlier one succeeded; cleanup below is unconditional.
    cudaError_t err = cudaMalloc((void**)&d_pImgOutGPU, nBytes);
    if (err == cudaSuccess)
        // Zero the output so padding bytes the kernel never writes are defined.
        err = cudaMemset(d_pImgOutGPU, 0, nBytes);
    if (err == cudaSuccess)
        err = cudaMallocArray(&cuArray, &channelDesc, nWidth, nHeight);
    if (err == cudaSuccess)
        // Pitched copy: take nWidth bytes from each host row and step nWidthStep
        // bytes between rows, so padded images are uploaded without skew.
        err = cudaMemcpy2DToArray(cuArray, 0, 0, pImgIn, (size_t)nWidthStep,
                                  (size_t)nWidth * sizeof(uchar), (size_t)nHeight,
                                  cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        // Texture attributes (addressing, filtering, normalization) are left at their defaults.
        err = cudaBindTextureToArray(&tex, cuArray, &channelDesc);
        bTexBound = (err == cudaSuccess);
    }
    if (err == cudaSuccess)
    {
        // One thread per pixel, 16x16 threads per block, grid rounded up.
        dim3 threads(16,16);
        dim3 grid((nWidth*nChannels+threads.x-1)/threads.x,(nHeight+threads.y-1)/threads.y);
        ImgEdge_3x3_S_kernel<<<grid,threads>>>(d_pImgOutGPU,nWidth,nHeight,nWidthStep);
        // A launch does not return a status; fetch it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        // Blocking copy: waits for the kernel and reports any execution error.
        err = cudaMemcpy(pImgOut, d_pImgOutGPU, nBytes, cudaMemcpyDeviceToHost);

    // Release resources only after the kernel has finished (or never started).
    if (bTexBound)
        cudaUnbindTexture(tex);
    if (cuArray != NULL)
        cudaFreeArray(cuArray);
    cudaFree(d_pImgOutGPU);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaImgEdge: CUDA error: %s\n", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
// Program entry: loads a grayscale image, runs cudaImgEdge on it, prints the
// elapsed time measured with CUDA events, and displays input and output.
int main()
{
    // Load the source image; flag 0 requests a single-channel grayscale image.
    Mat srcImage = imread("D:\\Images\\lenna1024.bmp", 0);
    if (srcImage.empty())
    {
        cout << "Can not load image" << endl;
        return -1;
    }

    // The destination starts as a copy of the source so it has the same layout.
    Mat dstImage = srcImage.clone();
    const int imageCols = srcImage.cols;
    const int imageRows = srcImage.rows;
    const int rowStep = srcImage.step;          // bytes per image row
    const int channelCount = srcImage.channels();
    uchar* srcPixels = srcImage.data;
    uchar* dstPixels = dstImage.data;

    // CUDA events bracket the complete cudaImgEdge call, so the reported time
    // covers everything that call does, not only the kernel.
    cudaEvent_t evBegin, evEnd;
    cudaEventCreate(&evBegin);
    cudaEventCreate(&evEnd);

    cudaEventRecord(evBegin, 0);
    cudaImgEdge(dstPixels, srcPixels, imageCols, imageRows, rowStep, channelCount);
    cudaEventRecord(evEnd, 0);
    cudaEventSynchronize(evEnd);

    float elapsedMs;
    cudaEventElapsedTime(&elapsedMs, evBegin, evEnd);
    printf("Time to generate:%f ms\n", elapsedMs);

    cudaEventDestroy(evBegin);
    cudaEventDestroy(evEnd);

    // Show both images and wait for a key press.
    imshow("ImgIn", srcImage);
    imshow("ImgOut", dstImage);
    waitKey(0);
    return 0;
}
结果:一幅1024×1024的图像,运行时间为127 ms。
3 未使用纹理内存的代码
// Sobel edge extraction, version without texture memory
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<time.h>
#include"opencv2\opencv.hpp"
using namespace std;
using namespace cv;
// Device kernel: 3x3 Sobel edge detection reading the source image directly
// from a linear buffer.
//
// Each thread handles the pixel (ix, iy) derived from its 2D block/thread index;
// threads outside nWidth x nHeight return without touching memory.
//   pImgOut    - output buffer, written at pImgOut[iy * nWidthStep + ix]
//   pImgIn     - input buffer, read at pImgIn[row * nWidthStep + col]
//   nWidth     - number of pixels processed per row
//   nHeight    - number of rows processed
//   nWidthStep - distance between consecutive rows, in elements, for both buffers
// Neighbour coordinates are clamped to the image, so border pixels reuse the
// nearest valid row/column. The stored value is |Gx| + |Gy| saturated to 255.
__global__ void ImgEdge_3x3_Sobel_kernel(uchar* pImgOut,
uchar* pImgIn,
int nWidth,
int nHeight,
int nWidthStep)
{
    const int ix=blockIdx.x*blockDim.x+threadIdx.x;
    const int iy=blockIdx.y*blockDim.y+threadIdx.y;

    // The grid may be larger than the image.
    if(ix>=nWidth||iy>=nHeight)
        return;

    // Clamp the neighbour columns/rows to stay inside the image.
    const int ix_1=max(0,ix-1);
    const int ix1=min(nWidth-1,ix+1);
    const int iy_1=max(0,iy-1);
    const int iy1=min(nHeight-1,iy+1);

    // Row base pointers; offsets are widened so large images cannot overflow int.
    const uchar* rowUp=pImgIn+(size_t)iy_1*nWidthStep;
    const uchar* rowMid=pImgIn+(size_t)iy*nWidthStep;
    const uchar* rowDown=pImgIn+(size_t)iy1*nWidthStep;

    // 3x3 neighbourhood (the centre pixel has weight zero in both masks).
    const int point00=rowUp[ix_1];
    const int point01=rowUp[ix];
    const int point02=rowUp[ix1];
    const int point10=rowMid[ix_1];
    const int point12=rowMid[ix1];
    const int point20=rowDown[ix_1];
    const int point21=rowDown[ix];
    // Bottom-right neighbour must come from the lower row (iy1), not the centre row.
    const int point22=rowDown[ix1];

    // Horizontal gradient: right column minus left column.
    const int Horz=point02+2*point12+point22-point00-2*point10-point20;
    // Vertical gradient: upper row minus lower row.
    const int Vert=point00+2*point01+point02-point20-2*point21-point22;
    const int Sum=abs(Horz)+abs(Vert);

    // Saturate to the 8-bit range before storing.
    pImgOut[(size_t)iy*nWidthStep+ix]=(uchar)(Sum>255?255:Sum);
}
// Host-side wrapper: uploads the image, runs the global-memory 3x3 Sobel kernel,
// and downloads the result.
//
//   pImgOut    - host buffer that receives the result (nWidthStep * nHeight bytes)
//   pImgIn     - host source image (nWidthStep * nHeight bytes)
//   nWidth     - pixels per row
//   nHeight    - number of rows
//   nWidthStep - row pitch in bytes of both host buffers (must be >= nWidth)
//   nChannels  - only used to size the launch grid
//
// Returns 0 on success. Returns -1 (after printing a message to stderr) when the
// arguments are invalid or any CUDA call fails.
double cudaImgEdge(uchar* pImgOut,
uchar* pImgIn,
int nWidth,
int nHeight,
int nWidthStep,
int nChannels)
{
    if (pImgOut == NULL || pImgIn == NULL ||
        nWidth <= 0 || nHeight <= 0 || nWidthStep < nWidth)
    {
        fprintf(stderr, "cudaImgEdge: invalid arguments\n");
        return -1;
    }

    // Widen before multiplying so large images cannot overflow int.
    const size_t nBytes = (size_t)nWidthStep * (size_t)nHeight * sizeof(uchar);

    uchar* d_pImgInGPU = NULL;    // device copy of the source image
    uchar* d_pImgOutGPU = NULL;   // device output buffer

    // Each step runs only if every earlier one succeeded; cleanup below is unconditional.
    cudaError_t err = cudaMalloc((void**)&d_pImgInGPU, nBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_pImgOutGPU, nBytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_pImgInGPU, pImgIn, nBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        // Zero the output so padding bytes the kernel never writes are defined.
        err = cudaMemset(d_pImgOutGPU, 0, nBytes);
    if (err == cudaSuccess)
    {
        // One thread per pixel, 16x16 threads per block, grid rounded up.
        dim3 threads(16,16);
        dim3 grid((nWidth*nChannels+threads.x-1)/threads.x,(nHeight+threads.y-1)/threads.y);
        ImgEdge_3x3_Sobel_kernel<<<grid,threads>>>(d_pImgOutGPU,d_pImgInGPU,nWidth,nHeight,nWidthStep);
        // A launch does not return a status; fetch it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        // Blocking copy: waits for the kernel and reports any execution error.
        err = cudaMemcpy(pImgOut, d_pImgOutGPU, nBytes, cudaMemcpyDeviceToHost);

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_pImgInGPU);
    cudaFree(d_pImgOutGPU);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaImgEdge: CUDA error: %s\n", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
// Program entry: loads a grayscale image, runs cudaImgEdge on it, prints the
// elapsed time measured with CUDA events, and displays input and output.
int main()
{
    // Load the source image; flag 0 requests a single-channel grayscale image.
    Mat inputMat = imread("D:\\Images\\lenna1024.bmp", 0);
    if (inputMat.empty())
    {
        cout << "Can not load image" << endl;
        return -1;
    }

    // The result image starts as a copy of the input so it has the same layout.
    Mat resultMat = inputMat.clone();
    const int width = inputMat.cols;
    const int height = inputMat.rows;
    const int stepBytes = inputMat.step;        // bytes per image row
    const int channels = inputMat.channels();
    uchar* inputData = inputMat.data;
    uchar* resultData = resultMat.data;

    // CUDA events bracket the complete cudaImgEdge call, so the reported time
    // covers everything that call does, not only the kernel.
    cudaEvent_t tBegin, tEnd;
    cudaEventCreate(&tBegin);
    cudaEventCreate(&tEnd);

    cudaEventRecord(tBegin, 0);
    cudaImgEdge(resultData, inputData, width, height, stepBytes, channels);
    cudaEventRecord(tEnd, 0);
    cudaEventSynchronize(tEnd);

    float milliseconds;
    cudaEventElapsedTime(&milliseconds, tBegin, tEnd);
    printf("Time to generate:%f ms\n", milliseconds);

    cudaEventDestroy(tBegin);
    cudaEventDestroy(tEnd);

    // Show both images and wait for a key press.
    imshow("ImgIn", inputMat);
    imshow("ImgOut", resultMat);
    waitKey(0);
    return 0;
}
同样的图片,运行时间为17 ms。
不知道我对纹理内存的用法是否正确。