CUDA 图像处理使用纹理内存与不使用纹理内存对比

1 最近在学习CUDA优化,看到纹理内存部分,了解到使用纹理内存可以加速访存。
可是改用纹理内存之后,图像的处理速度反而变慢了,不明白原因出在哪里。

2 使用纹理内存代码
//Sobel 边缘提取 使用纹理内存

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<iostream>
#include"opencv2\opencv.hpp"

using namespace std;
using namespace cv;

//声明纹理参考系
texture<uchar,2,cudaReadModeElementType> tex;//2d texture

//设备端kernel函数
//3x3 Sobel edge magnitude, one thread per output pixel, input sampled
//through the file-level 2D texture reference `tex`.
//  pImgOut    - output buffer, written at pImgOut[x + y*nWidthStep]
//  nWidth     - number of columns to process
//  nHeight    - number of rows to process
//  nWidthStep - distance between consecutive output rows, in elements
//Expects a 2D launch; threads whose (x,y) lies outside nWidth x nHeight do nothing.
//Neighbours outside the image are fetched with out-of-range coordinates, so
//their value is whatever the texture's address mode yields.
__global__ void ImgEdge_3x3_S_kernel(uchar* pImgOut,
                                       int nWidth,
                                       int nHeight,
                                       int nWidthStep)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;

    //Threads in the grid tail have no pixel to produce
    if(col>=nWidth||row>=nHeight)
        return;

    const float fx=(float)col;
    const float fy=(float)row;

    //nb[r][c] holds the texel at (col+c-1,row+r-1)
    uchar nb[3][3];
    for(int dy=-1;dy<=1;++dy)
    {
        for(int dx=-1;dx<=1;++dx)
        {
            nb[dy+1][dx+1]=tex2D(tex,fx+dx,fy+dy);
        }
    }

    //Horizontal gradient: right column minus left column, centre row weighted by 2
    const int gradX=nb[0][2]+2*nb[1][2]+nb[2][2]-nb[0][0]-2*nb[1][0]-nb[2][0];
    //Vertical gradient: top row minus bottom row, centre column weighted by 2
    const int gradY=nb[0][0]+2*nb[0][1]+nb[0][2]-nb[2][0]-2*nb[2][1]-nb[2][2];

    //|Gx|+|Gy|, saturated to the 8-bit range
    const int magnitude=abs(gradX)+abs(gradY);
    pImgOut[col+row*nWidthStep]=(magnitude>255)?255:(uchar)magnitude;
}
//主机端函数
//Host-side wrapper: uploads the image into a CUDA array, binds it to the
//file-level texture reference `tex`, runs ImgEdge_3x3_S_kernel and copies the
//result back to the host.
//  pImgOut    - host output buffer, nWidthStep*nHeight bytes
//  pImgIn     - host input buffer, rows nWidthStep bytes apart
//  nWidth     - image width in pixels
//  nHeight    - image height in pixels
//  nWidthStep - host row pitch in bytes (used for both input and output)
//  nChannels  - only used to size the grid in x; the kernel itself processes
//               columns < nWidth, so this presumably targets 1-channel images
//Returns 0 on success, -1 if any CUDA call fails (the error is printed to stderr).
double cudaImgEdge(uchar* pImgOut,
                     uchar* pImgIn,
                     int nWidth,
                     int nHeight,
                     int nWidthStep,
                     int nChannels)
{
    //Compute the byte count in size_t so large images cannot overflow int
    const size_t nBytes=(size_t)nWidthStep*(size_t)nHeight*sizeof(uchar);

    uchar* d_pImgOutGPU=NULL;
    cudaArray* cuArray=NULL;
    bool bTexBound=false;
    cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc<uchar>();

    //Output buffer, cleared so bytes the kernel never writes (row padding) are defined
    cudaError_t err=cudaMalloc((void**)&d_pImgOutGPU,nBytes);
    if(err==cudaSuccess)
        err=cudaMemset(d_pImgOutGPU,0,nBytes);

    //The input lives only in the CUDA array; no separate linear device copy is
    //needed because the kernel reads exclusively through the texture
    if(err==cudaSuccess)
        err=cudaMallocArray(&cuArray,&channelDesc,nWidth,nHeight);

    //Pitched copy: host rows are nWidthStep bytes apart while the array is
    //exactly nWidth texels wide, so a flat copy would shear the image
    //whenever nWidthStep!=nWidth
    if(err==cudaSuccess)
        err=cudaMemcpy2DToArray(cuArray,0,0,pImgIn,(size_t)nWidthStep,
                                (size_t)nWidth*sizeof(uchar),(size_t)nHeight,
                                cudaMemcpyHostToDevice);

    if(err==cudaSuccess)
    {
        err=cudaBindTextureToArray(&tex,cuArray,&channelDesc);
        bTexBound=(err==cudaSuccess);
    }

    if(err==cudaSuccess)
    {
        dim3 threads(16,16);
        dim3 grid((nWidth*nChannels+threads.x-1)/threads.x,(nHeight+threads.y-1)/threads.y);
        ImgEdge_3x3_S_kernel<<<grid,threads>>>(d_pImgOutGPU,nWidth,nHeight,nWidthStep);
        //Launch-configuration errors are only reported through cudaGetLastError
        err=cudaGetLastError();
    }

    //Blocking copy: also surfaces any fault raised while the kernel was running
    if(err==cudaSuccess)
        err=cudaMemcpy(pImgOut,d_pImgOutGPU,nBytes,cudaMemcpyDeviceToHost);

    //Release everything that was acquired, on success and on failure alike
    if(bTexBound)
        cudaUnbindTexture(tex);
    if(cuArray!=NULL)
        cudaFreeArray(cuArray);
    if(d_pImgOutGPU!=NULL)
        cudaFree(d_pImgOutGPU);

    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaImgEdge failed: %s\n",cudaGetErrorString(err));
        return -1;
    }
    return 0;
}

//Entry point: loads a fixed test image, runs the texture-based edge detector
//on it, prints the elapsed time measured with CUDA events and shows the input
//and output images until a key is pressed.
int main()
{
    //Second imread argument 0: OpenCV's grayscale load flag
    Mat ImgIn=imread("D:\\Images\\lenna1024.bmp",0);
    if(ImgIn.empty())
    {
        cout<<"Can not load image"<<endl;
        return -1;
    }

    //The output starts as a copy of the input and is overwritten by cudaImgEdge
    Mat ImgOut=ImgIn.clone();

    //Geometry handed to the GPU routine
    const int imgCols=ImgIn.cols;
    const int imgRows=ImgIn.rows;
    const int rowPitch=ImgIn.step;
    const int channelCount=ImgIn.channels();
    uchar* const hostSrc=ImgIn.data;
    uchar* const hostDst=ImgOut.data;

    //Bracket the whole call (allocation, copies and kernel) with two events
    cudaEvent_t evBegin,evEnd;
    cudaEventCreate(&evBegin);
    cudaEventCreate(&evEnd);

    cudaEventRecord(evBegin,0);
    cudaImgEdge(hostDst,hostSrc,imgCols,imgRows,rowPitch,channelCount);
    cudaEventRecord(evEnd,0);

    //Wait for the end event before reading the timer
    cudaEventSynchronize(evEnd);
    float elapsedMs;
    cudaEventElapsedTime(&elapsedMs,evBegin,evEnd);
    printf("Time to generate:%f ms\n",elapsedMs);

    cudaEventDestroy(evBegin);
    cudaEventDestroy(evEnd);

    //Display both images and block until a key press
    imshow("ImgIn",ImgIn);
    imshow("ImgOut",ImgOut);
    waitKey(0);
    return 0;
}

结果:处理一幅1024×1024的图像,运行时间为127ms。

3 未使用纹理内存代码

//Sobel 边缘提取 未使用纹理内存

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<time.h>
#include"opencv2\opencv.hpp"

using namespace std;
using namespace cv;


//设备端kernel函数
//3x3 Sobel edge magnitude, one thread per output pixel, reading the input
//directly from a linear buffer.
//  pImgOut    - output buffer, written at pImgOut[y*nWidthStep + x]
//  pImgIn     - input buffer, read at pImgIn[y*nWidthStep + x]
//  nWidth     - number of columns to process
//  nHeight    - number of rows to process
//  nWidthStep - distance between consecutive rows, in elements
//Expects a 2D launch; threads outside nWidth x nHeight do nothing.
//Neighbour coordinates are clamped to the image, so border pixels reuse the
//nearest edge pixel.
__global__ void ImgEdge_3x3_Sobel_kernel(uchar* pImgOut,
                                       uchar* pImgIn,
                                       int nWidth,
                                       int nHeight,
                                       int nWidthStep)
{
    const int ix=blockIdx.x*blockDim.x+threadIdx.x;
    const int iy=blockIdx.y*blockDim.y+threadIdx.y;

    //Threads in the grid tail have no pixel to produce
    if(ix>=nWidth||iy>=nHeight)
        return;

    //Clamp the neighbour coordinates to the valid image range
    const int ix_1=max(0,ix-1);
    const int ix1=min(nWidth-1,ix+1);
    const int iy_1=max(0,iy-1);
    const int iy1=min(nHeight-1,iy+1);

    //Row base pointers: previous, current and next row
    const uchar* pRowPrev=pImgIn+(size_t)iy_1*nWidthStep;
    const uchar* pRowCur=pImgIn+(size_t)iy*nWidthStep;
    const uchar* pRowNext=pImgIn+(size_t)iy1*nWidthStep;

    const uchar point00=pRowPrev[ix_1];
    const uchar point01=pRowPrev[ix];
    const uchar point02=pRowPrev[ix1];
    const uchar point10=pRowCur[ix_1];
    const uchar point12=pRowCur[ix1];
    const uchar point20=pRowNext[ix_1];
    const uchar point21=pRowNext[ix];
    //Bottom-right tap must come from the NEXT row (was read from the current
    //row, which duplicated point12 and corrupted both gradients)
    const uchar point22=pRowNext[ix1];

    //Horizontal gradient: right column minus left column
    const int Horz=point02+2*point12+point22-point00-2*point10-point20;
    //Vertical gradient: top row minus bottom row
    const int Vert=point00+2*point01+point02-point20-2*point21-point22;

    //|Gx|+|Gy|, saturated to the 8-bit range
    const int Sum=abs(Horz)+abs(Vert);
    pImgOut[(size_t)iy*nWidthStep+ix]=(Sum>255)?255:(uchar)Sum;
}
//主机端函数
//Host-side wrapper: copies the image to the device, runs
//ImgEdge_3x3_Sobel_kernel on it and copies the result back to the host.
//  pImgOut    - host output buffer, nWidthStep*nHeight bytes
//  pImgIn     - host input buffer, nWidthStep*nHeight bytes
//  nWidth     - image width in pixels
//  nHeight    - image height in pixels
//  nWidthStep - row pitch in bytes, shared by host and device buffers
//  nChannels  - only used to size the grid in x; the kernel itself processes
//               columns < nWidth, so this presumably targets 1-channel images
//Returns 0 on success, -1 if any CUDA call fails (the error is printed to stderr).
double cudaImgEdge(uchar* pImgOut,
                     uchar* pImgIn,
                     int nWidth,
                     int nHeight,
                     int nWidthStep,
                     int nChannels)
{
    //Compute the byte count in size_t so large images cannot overflow int
    const size_t nBytes=(size_t)nWidthStep*(size_t)nHeight*sizeof(uchar);

    uchar* d_pImgInGPU=NULL;
    uchar* d_pImgOutGPU=NULL;

    //Device buffers keep the host layout, row padding included
    cudaError_t err=cudaMalloc((void**)&d_pImgInGPU,nBytes);
    if(err==cudaSuccess)
        err=cudaMalloc((void**)&d_pImgOutGPU,nBytes);

    //Upload the input; clear the output so bytes the kernel never writes are defined
    if(err==cudaSuccess)
        err=cudaMemcpy(d_pImgInGPU,pImgIn,nBytes,cudaMemcpyHostToDevice);
    if(err==cudaSuccess)
        err=cudaMemset(d_pImgOutGPU,0,nBytes);

    if(err==cudaSuccess)
    {
        dim3 threads(16,16);
        dim3 grid((nWidth*nChannels+threads.x-1)/threads.x,(nHeight+threads.y-1)/threads.y);
        ImgEdge_3x3_Sobel_kernel<<<grid,threads>>>(d_pImgOutGPU,d_pImgInGPU,nWidth,nHeight,nWidthStep);
        //Launch-configuration errors are only reported through cudaGetLastError
        err=cudaGetLastError();
    }

    //Blocking copy: also surfaces any fault raised while the kernel was running
    if(err==cudaSuccess)
        err=cudaMemcpy(pImgOut,d_pImgOutGPU,nBytes,cudaMemcpyDeviceToHost);

    //Release whatever was allocated, on success and on failure alike
    if(d_pImgInGPU!=NULL)
        cudaFree(d_pImgInGPU);
    if(d_pImgOutGPU!=NULL)
        cudaFree(d_pImgOutGPU);

    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaImgEdge failed: %s\n",cudaGetErrorString(err));
        return -1;
    }
    return 0;
}

//Entry point: loads a fixed test image, runs the global-memory edge detector
//on it, prints the elapsed time measured with CUDA events and shows the input
//and output images until a key is pressed.
int main()
{
    //Second imread argument 0: OpenCV's grayscale load flag
    Mat ImgIn=imread("D:\\Images\\lenna1024.bmp",0);
    const bool loadFailed=ImgIn.empty();
    if(loadFailed)
    {
        cout<<"Can not load image"<<endl;
        return -1;
    }

    //Destination image: same size and type as the source, overwritten by the GPU result
    Mat ImgOut=ImgIn.clone();

    //Raw pixel pointers and layout passed to the GPU routine
    uchar* srcPixels=ImgIn.data;
    uchar* dstPixels=ImgOut.data;
    int width=ImgIn.cols;
    int height=ImgIn.rows;
    int pitchBytes=ImgIn.step;
    int channels=ImgIn.channels();

    //Timer events recorded on stream 0 immediately before and after the call
    cudaEvent_t tStart,tStop;
    cudaEventCreate(&tStart);
    cudaEventCreate(&tStop);
    cudaEventRecord(tStart,0);

    cudaImgEdge(dstPixels,srcPixels,width,height,pitchBytes,channels);

    cudaEventRecord(tStop,0);
    cudaEventSynchronize(tStop);

    //Report the time between the two events
    float milliseconds;
    cudaEventElapsedTime(&milliseconds,tStart,tStop);
    printf("Time to generate:%f ms\n",milliseconds);
    cudaEventDestroy(tStart);
    cudaEventDestroy(tStop);

    //Show source and result; waitKey(0) blocks until a key is pressed
    imshow("ImgIn",ImgIn);
    imshow("ImgOut",ImgOut);

    waitKey(0);
    return 0;
}

处理同样的图片,运行时间为17ms。

不知道我对纹理内存的使用方式是否正确。

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值