CUDA优化实例(五)纹理内存与常量内存

CUDA优化实例(五)纹理内存与常量内存

做了个卷积核的实验:
最基本情况:

结果:

这里写图片描述


使用常量内存:
这里写图片描述


纹理内存:
这里写图片描述


结果分析

没有使用texture memory 核函数花费80us,使用了纹理内存的,不管是线性内存还是CUDA Array都是140us

实例

源码:
CMakeLists.txt:

# CMakeLists.txt for G4CU project
project(test_cuda_project)
# required cmake version
cmake_minimum_required(VERSION 2.8)
# packages
find_package(CUDA)
# nvcc flags
set(CUDA_NVCC_FLAGS -arch=compute_60)
#set(CUDA_NVCC_FLAGS -gencode arch=compute_52,code=sm_52;-G;-g)

file(GLOB_RECURSE CURRENT_HEADERS  *.h *.hpp *.cuh)
file(GLOB CURRENT_SOURCES  *.cpp *.cu)
source_group("Include" FILES ${CURRENT_HEADERS}) 
source_group("Source" FILES ${CURRENT_SOURCES}) 



# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI

set(OpenCV_DIR /home/jie/third_party/opencv-3.4.0/build)

find_package(OpenCV REQUIRED)



# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

# Add OpenCV headers location to your include paths
include_directories(${OpenCV_INCLUDE_DIRS})


#CUDA_ADD_EXECUTABLE(test_cuda_project ${CURRENT_HEADERS} ${CURRENT_SOURCES})
CUDA_ADD_EXECUTABLE(test_cuda_project convolution.cu)


# Link your application with OpenCV libraries
target_link_libraries(test_cuda_project ${OpenCV_LIBS})

CUDA_ADD_EXECUTABLE(tex tex.cu)
# Link your application with OpenCV libraries
target_link_libraries(tex ${OpenCV_LIBS})

CUDA_ADD_EXECUTABLE(qq texture.cu)
# Link your application with OpenCV libraries
target_link_libraries(qq ${OpenCV_LIBS})

texture.cu:

#include <stdio.h>  
#include <iostream>
#include <stdlib.h>  
#include <cuda_runtime.h>  
#include "device_launch_parameters.h"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <time.h>
#include <math.h>

using namespace cv;


#define DEGRE_TO_RADIAN(x) ((x) * acos(-1) / 180)
#define CEIL(x,y) ((((x) + (y) - 1))/ (y) )
#define CHECK(res) { if(res != cudaSuccess){printf("Error :%s:%d , ", __FILE__,__LINE__);   \
printf("code : %d , reason : %s \n", res,cudaGetErrorString(res));exit(-1);}}

//texture<uchar, cudaTextureType2D, cudaReadModeElementType> texRef;
texture<uchar, 1, cudaReadModeElementType> texRef;

texture<uchar, 2, cudaReadModeElementType> texRef2D;

__constant__ int dev_H[9];

__global__ void kernel(uchar* dout_data,uchar *d_data,int width,int height)
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int coreIndex = y*width+x;

    if (x > 0 && x < width - 1 && y >0 && y < height - 1)
    {
        // dout_data[coreIndex] = (
        //  d_data[coreIndex - 1] * 2  + d_data[coreIndex] *4 + d_data[coreIndex+1]*2+
        //  d_data[coreIndex - width - 1] + d_data[coreIndex - width ]*2 + d_data[coreIndex - width + 1]+
        //  d_data[coreIndex + width - 1] + d_data[coreIndex + width] *2 + d_data[coreIndex+ width + 1] )/16 ;


        dout_data[coreIndex] =(
            d_data[coreIndex - width - 1]* 1  + d_data[coreIndex - width] *  2 + d_data[coreIndex - width + 1] * 1+
            d_data[coreIndex - 1] *        2  + d_data[coreIndex        ] *  4   + d_data[coreIndex+1        ] * 2+
            d_data[coreIndex + width - 1]* 1  + d_data[coreIndex + width] *  2 + d_data[coreIndex + width + 1] * 1 
        )/16;

    }

}
__global__ void kernelUseCon(uchar* dout_data,uchar *d_data,int width,int height)
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int coreIndex = y*width+x;

    if (x > 0 && x < width - 1 && y >0 && y < height - 1)
    {           
        dout_data[coreIndex] =(
            d_data[coreIndex - width - 1]* dev_H[0]  + d_data[coreIndex - width] *  dev_H[1] + d_data[coreIndex - width + 1] * dev_H[2]+
            d_data[coreIndex - 1] *        dev_H[3]  + d_data[coreIndex        ] *  dev_H[4]   + d_data[coreIndex+1        ] * dev_H[5]+
            d_data[coreIndex + width - 1]* dev_H[6]  + d_data[coreIndex + width] *  dev_H[7] + d_data[coreIndex + width + 1] * dev_H[8] 
        )/16;

    }

}
__global__ void kernelUseTex(uchar* dout_data,int width,int height)
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int coreIndex = y*width+x;

    if (x > 0 && x < width - 1 && y >0 && y < height - 1)
    {       
        dout_data[coreIndex] =(
            tex1Dfetch(texRef,coreIndex - width - 1)* 1  + tex1Dfetch(texRef,coreIndex - width) *  2 + tex1Dfetch(texRef,coreIndex - width + 1) * 1+
            tex1Dfetch(texRef,coreIndex - 1) *        2  + tex1Dfetch(texRef,coreIndex        ) *  4 + tex1Dfetch(texRef,coreIndex+1          ) * 2+
            tex1Dfetch(texRef,coreIndex + width - 1)* 1  + tex1Dfetch(texRef,coreIndex + width) *  2 + tex1Dfetch(texRef,coreIndex + width + 1) * 1 
        )/16;

    }

}
__global__ void kernelUseTex2D(uchar* dout_data,int width,int height)
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int coreIndex = y*width+x;

    if (x > 0 && x < width - 1 && y >0 && y < height - 1)
    {       
        dout_data[coreIndex] =(
            tex2D(texRef2D,x-1,y-1)* 1  + tex2D(texRef2D,x,y-1) *  2 + tex2D(texRef2D,x+1,y-1) * 1+
            tex2D(texRef2D,x-1,y)  * 2  + tex2D(texRef2D,x,y)   *  4 + tex2D(texRef2D,x+1,y)   * 2+
            tex2D(texRef2D,x-1,y+1)* 1  + tex2D(texRef2D,x,y+1) *  2 + tex2D(texRef2D,x+1,y+1) * 1 
        )/16;

    }

}
int main()
{
    Mat src = imread("../road.png",CV_LOAD_IMAGE_GRAYSCALE);
    Mat dst;
    dst.create(src.size(), src.type());
    int height = src.size().height;
    int width = src.size().width;
    int size = width*height*sizeof(uchar);
    uchar *d_data,*dout_data;

    if (src.empty())
    {
        fprintf(stderr, "Can not load image %s\n", "road.png");
        return -1;
    }

    /*  common */
    CHECK(cudaMalloc((void **)&d_data, size));
    CHECK(cudaMalloc((void **)&dout_data, size));
    CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));

    dim3 dimBlock(32, 4);
    dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));

    kernel<<<dimGrid,dimBlock>>>(dout_data,d_data,width,height);
    CHECK(cudaDeviceSynchronize());


    CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /*  common */

    /*  use constance  */
    // int H[9];
    // H[0]=1;H[1]=2;H[2]=1;
    // H[3]=2;H[4]=4;H[5]=2;
    // H[6]=1;H[7]=2;H[8]=1;

    // cudaMemcpyToSymbol(dev_H,H,sizeof(int)*9);

    // CHECK(cudaMalloc((void **)&d_data, size));
    // CHECK(cudaMalloc((void **)&dout_data, size));
    // CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));

    // dim3 dimBlock(32, 4);
    // dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));

    // kernelUseCon<<<dimGrid,dimBlock>>>(dout_data,d_data,width,height);
    // CHECK(cudaDeviceSynchronize());


    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /*  use constance  */


    /*  use texture Linar 1D */
    // CHECK(cudaMalloc((void **)&d_data, size));
    // CHECK(cudaMalloc((void **)&dout_data, size));
    // CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));

    // CHECK(cudaBindTexture(NULL,texRef,d_data,size));

    // dim3 dimBlock(32, 4);
    // dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));

    // kernelUseTex<<<dimGrid,dimBlock>>>(dout_data,width,height);
    // CHECK(cudaDeviceSynchronize());


    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /*  use texture Linar 1D */

    /*  use texture CUDA Array 2D */
    // CHECK(cudaMalloc((void **)&dout_data, size));

    // cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    // cudaArray* cuArray;
    // CHECK(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // CHECK(cudaMemcpyToArray(cuArray,0,0,src.data,size,cudaMemcpyHostToDevice));


    // CHECK(cudaBindTextureToArray(texRef2D,cuArray,channelDesc));

    // dim3 dimBlock(32, 4);
    // dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));

    // kernelUseTex2D<<<dimGrid,dimBlock>>>(dout_data,width,height);
    // CHECK(cudaDeviceSynchronize());


    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /*  use texture CUDA Array 2D */




    imshow("Linear Blend",dst);
    waitKey(0);



    return 0;
}

分析:

常量内存不怎么影响程序的性能,但它可以减少本地内存的使用,有利于占用率的提升。
纹理内存没有增加系统的性能,反而降级,原因可能是此例的内存访问,在线程束中是合并的,且独立的内存请求较多(类似展开)。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值