CUDA优化实例(五)纹理内存与常量内存
做了个卷积核的实验:
最基本情况:
结果:
使用常量内存:
纹理内存:
结果分析
没有使用 texture memory 的核函数花费 80us;使用了纹理内存的版本,不管绑定的是线性内存还是 CUDA Array,都需要 140us。
实例
源码:
CMakeLists.txt:
# CMakeLists.txt for G4CU project

# required cmake version — must be declared BEFORE project() so that
# policy/feature behavior is fixed before the languages are enabled
cmake_minimum_required(VERSION 2.8)
project(test_cuda_project)

# packages — REQUIRED makes configuration fail fast with a clear message
# when the CUDA toolkit is not found, instead of erroring later at build time
find_package(CUDA REQUIRED)

# nvcc flags
set(CUDA_NVCC_FLAGS -arch=compute_60)
#set(CUDA_NVCC_FLAGS -gencode arch=compute_52,code=sm_52;-G;-g)

# Collect sources for IDE source groups
file(GLOB_RECURSE CURRENT_HEADERS *.h *.hpp *.cuh)
file(GLOB CURRENT_SOURCES *.cpp *.cu)
source_group("Include" FILES ${CURRENT_HEADERS})
source_group("Source" FILES ${CURRENT_SOURCES})

# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI
set(OpenCV_DIR /home/jie/third_party/opencv-3.4.0/build)
find_package(OpenCV REQUIRED)

# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS " version: ${OpenCV_VERSION}")
message(STATUS " libraries: ${OpenCV_LIBS}")
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")

# Add OpenCV headers location to your include paths
include_directories(${OpenCV_INCLUDE_DIRS})

# One executable per experiment; each links against OpenCV
CUDA_ADD_EXECUTABLE(test_cuda_project convolution.cu)
target_link_libraries(test_cuda_project ${OpenCV_LIBS})

CUDA_ADD_EXECUTABLE(tex tex.cu)
target_link_libraries(tex ${OpenCV_LIBS})

CUDA_ADD_EXECUTABLE(qq texture.cu)
target_link_libraries(qq ${OpenCV_LIBS})
texture.cu:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <time.h>
#include <math.h>
using namespace cv;
// Degrees-to-radians conversion (acos(-1) evaluates to pi).
#define DEGRE_TO_RADIAN(x) ((x) * acos(-1) / 180)
// Ceiling integer division: number of blocks of size y needed to cover x items.
#define CEIL(x,y) ((((x) + (y) - 1))/ (y) )
// Abort with file/line and the CUDA error string when a runtime call fails.
#define CHECK(res) { if(res != cudaSuccess){printf("Error :%s:%d , ", __FILE__,__LINE__); \
printf("code : %d , reason : %s \n", res,cudaGetErrorString(res));exit(-1);}}
//texture<uchar, cudaTextureType2D, cudaReadModeElementType> texRef;
// 1D texture reference bound to linear device memory; read with tex1Dfetch.
texture<uchar, 1, cudaReadModeElementType> texRef;
// 2D texture reference bound to a cudaArray; read with tex2D.
texture<uchar, 2, cudaReadModeElementType> texRef2D;
// 3x3 convolution weights in constant memory, filled via cudaMemcpyToSymbol
// from the host (set to 1-2-1 / 2-4-2 / 1-2-1 in main).
__constant__ int dev_H[9];
// 3x3 Gaussian-style blur (weights 1-2-1 / 2-4-2 / 1-2-1, normalized by 16)
// over a row-major grayscale image in global memory.
// Expects a 2D grid/block launch covering width x height; the one-pixel
// border is left unwritten so every neighbour access stays in bounds.
__global__ void kernel(uchar* dout_data,uchar *d_data,int width,int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: skip out-of-range threads and the image border.
    if (col <= 0 || col >= width - 1 || row <= 0 || row >= height - 1)
        return;

    const int center = row * width + col;
    const int above  = center - width;
    const int below  = center + width;

    // uchar operands promote to int, so the weighted sum cannot overflow.
    int acc =     d_data[above - 1]  + 2 * d_data[above]  +     d_data[above + 1]
            + 2 * d_data[center - 1] + 4 * d_data[center] + 2 * d_data[center + 1]
            +     d_data[below - 1]  + 2 * d_data[below]  +     d_data[below + 1];

    dout_data[center] = (uchar)(acc / 16);
}
// 3x3 convolution with weights read from constant memory (dev_H, row-major).
// All lanes of a warp read the same dev_H element at a time, which is the
// broadcast-friendly access pattern constant memory is designed for.
// NOTE(review): the result is divided by a hard-coded 16, which assumes the
// dev_H entries sum to 16 (as the host sets them) — confirm before reusing
// this kernel with other weight sets.
__global__ void kernelUseCon(uchar* dout_data,uchar *d_data,int width,int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip out-of-range threads and the one-pixel border.
    if (col <= 0 || col >= width - 1 || row <= 0 || row >= height - 1)
        return;

    const int idx = row * width + col;

    // Accumulate the weighted 3x3 neighbourhood.
    int acc = 0;
    #pragma unroll
    for (int dy = -1; dy <= 1; ++dy)
    {
        #pragma unroll
        for (int dx = -1; dx <= 1; ++dx)
            acc += d_data[idx + dy * width + dx] * dev_H[(dy + 1) * 3 + (dx + 1)];
    }

    dout_data[idx] = (uchar)(acc / 16);
}
// Same 3x3 blur as kernel(), but the input is read through the 1D texture
// reference texRef (bound by the host to the linear buffer with
// cudaBindTexture), addressed with flat element indices via tex1Dfetch.
__global__ void kernelUseTex(uchar* dout_data,int width,int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip out-of-range threads and the one-pixel border.
    if (col <= 0 || col >= width - 1 || row <= 0 || row >= height - 1)
        return;

    const int idx = row * width + col;

    // 1-2-1 / 2-4-2 / 1-2-1 weights, normalized by 16 below.
    const int w[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };

    int acc = 0;
    #pragma unroll
    for (int dy = -1; dy <= 1; ++dy)
    {
        #pragma unroll
        for (int dx = -1; dx <= 1; ++dx)
            acc += tex1Dfetch(texRef, idx + dy * width + dx) * w[dy + 1][dx + 1];
    }

    dout_data[idx] = (uchar)(acc / 16);
}
// Same 3x3 blur, reading the input through the 2D texture reference texRef2D
// (bound by the host to a cudaArray), addressed with (x, y) pixel
// coordinates via tex2D.
__global__ void kernelUseTex2D(uchar* dout_data,int width,int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip out-of-range threads and the one-pixel border.
    if (col <= 0 || col >= width - 1 || row <= 0 || row >= height - 1)
        return;

    // 1-2-1 / 2-4-2 / 1-2-1 weights, normalized by 16 below.
    const int w[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };

    int acc = 0;
    #pragma unroll
    for (int dy = -1; dy <= 1; ++dy)
    {
        #pragma unroll
        for (int dx = -1; dx <= 1; ++dx)
            acc += tex2D(texRef2D, col + dx, row + dy) * w[dy + 1][dx + 1];
    }

    dout_data[row * width + col] = (uchar)(acc / 16);
}
// Host driver: loads ../road.png as grayscale, runs the global-memory blur
// kernel on the GPU, and displays the result. The commented sections are the
// constant-memory and texture-memory variants timed in the write-up.
int main()
{
    // Load the input image; bail out BEFORE using src.size() or src.data,
    // which are meaningless when the file is missing.
    Mat src = imread("../road.png",CV_LOAD_IMAGE_GRAYSCALE);
    if (src.empty())
    {
        fprintf(stderr, "Can not load image %s\n", "road.png");
        return -1;
    }

    Mat dst;
    dst.create(src.size(), src.type());
    const int height = src.size().height;
    const int width = src.size().width;
    const int size = width * height * (int)sizeof(uchar);

    uchar *d_data = NULL, *dout_data = NULL;

    /* global-memory version */
    CHECK(cudaMalloc((void **)&d_data, size));
    CHECK(cudaMalloc((void **)&dout_data, size));
    CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));
    // Pre-fill the output buffer with the input: the kernel never writes the
    // one-pixel border, so without this the border pixels copied back into
    // dst would be uninitialized device memory.
    CHECK(cudaMemcpy(dout_data, d_data, size, cudaMemcpyDeviceToDevice));

    dim3 dimBlock(32, 4);
    dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));
    kernel<<<dimGrid,dimBlock>>>(dout_data, d_data, width, height);
    CHECK(cudaGetLastError());       // catches launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // catches asynchronous execution errors
    CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /* global-memory version */

    /* use constant memory */
    // int H[9] = {1, 2, 1,
    //             2, 4, 2,
    //             1, 2, 1};                      // weights sum to 16
    // CHECK(cudaMemcpyToSymbol(dev_H, H, sizeof(int) * 9));
    // CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));
    // kernelUseCon<<<dimGrid,dimBlock>>>(dout_data, d_data, width, height);
    // CHECK(cudaGetLastError());
    // CHECK(cudaDeviceSynchronize());
    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    /* use constant memory */

    /* use texture bound to linear 1D memory */
    // CHECK(cudaMemcpy(d_data, src.data, size, cudaMemcpyHostToDevice));
    // CHECK(cudaBindTexture(NULL, texRef, d_data, size));
    // kernelUseTex<<<dimGrid,dimBlock>>>(dout_data, width, height);
    // CHECK(cudaGetLastError());
    // CHECK(cudaDeviceSynchronize());
    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    // CHECK(cudaUnbindTexture(texRef));
    /* use texture bound to linear 1D memory */

    /* use texture bound to a CUDA array (2D) */
    // cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    // cudaArray* cuArray;
    // CHECK(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // CHECK(cudaMemcpyToArray(cuArray, 0, 0, src.data, size, cudaMemcpyHostToDevice));
    // CHECK(cudaBindTextureToArray(texRef2D, cuArray, channelDesc));
    // kernelUseTex2D<<<dimGrid,dimBlock>>>(dout_data, width, height);
    // CHECK(cudaGetLastError());
    // CHECK(cudaDeviceSynchronize());
    // CHECK(cudaMemcpy(dst.data, dout_data, size, cudaMemcpyDeviceToHost));
    // CHECK(cudaUnbindTexture(texRef2D));
    // CHECK(cudaFreeArray(cuArray));
    /* use texture bound to a CUDA array (2D) */

    // Release device memory (the original leaked both buffers).
    CHECK(cudaFree(d_data));
    CHECK(cudaFree(dout_data));

    imshow("Linear Blend", dst);
    waitKey(0);
    return 0;
}
分析:
常量内存不怎么影响程序的性能,但它可以减少本地内存的使用,有利于占用率的提升。
纹理内存没有提升程序的性能,反而使性能下降。原因可能是:此例中全局内存的访问在线程束内本来就是合并的,而且每个线程发出的独立内存请求较多(类似循环展开的效果),所以纹理缓存带来的好处体现不出来。