第一次写帖子,也想长期在这里写帖子,和大家一起交流学习。
直接给代码:
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
# Tiling / kernel-size constants.
# NOTE(review): none of these are referenced by the Python code below, and the
# CUDA kernel re-declares KERNEL_RADIUS / KERNEL_W internally — if the kernel
# size ever changes, both copies must be updated in sync.
TILE_W = 16
TILE_H = 16
KERNEL_RADIUS = 1
KERNEL_W = 2 * KERNEL_RADIUS + 1
# Compile the CUDA kernel at import time.  It computes a direct (non-separable)
# 2-D discrete convolution with zero padding at the image borders, one thread
# per output pixel.
mod = SourceModule('''
__global__ void convolutionGPU(float *d_Result,float *d_Data,float *d_Kernel ,int dataW ,int dataH )
{
const int KERNEL_RADIUS=1;
const int KERNEL_W = 2 * KERNEL_RADIUS + 1;
// The small filter is reused by every output pixel, so stage it in shared
// memory.  Every thread redundantly writes the same values, which is
// race-free (same data, same locations).
__shared__ float sPartials[KERNEL_W*KERNEL_W];
int col = threadIdx.y + blockDim.y * blockIdx.y;  // image row index (0..dataH-1)
int row = threadIdx.x + blockDim.x * blockIdx.x;  // image column index (0..dataW-1)
int gLoc = row + dataW*col;                       // linear index, row-major
for(int i=0 ; i< KERNEL_W*KERNEL_W ; i+=1 )
sPartials[i]= d_Kernel[i];
float sum = 0;
float value = 0;
for(int i = -KERNEL_RADIUS; i<=KERNEL_RADIUS ; i++)
for(int j = -KERNEL_RADIUS; j<=KERNEL_RADIUS ;j++ ){
if( (col+j)<0 ||(row+i) < 0 ||(row+i) > (dataW-1) ||(col+j )>(dataH-1) )
value = 0;  // zero padding outside the image
else
// BUG FIX: the row stride of a row-major image is its width (dataW),
// not its height.  The original used j * dataH, which happened to
// work only because the test image is square (5x5).
value = d_Data[gLoc + i + j * dataW];
sum += value * sPartials[(i+KERNEL_RADIUS) + (j+KERNEL_RADIUS)*KERNEL_W];
}
d_Result[gLoc] = sum;
}
''')
convolutionGPU = mod.get_function("convolutionGPU")
def convolution_cuda(sourceImage, fil):
    """Convolve ``sourceImage`` with the 2-D filter ``fil`` on the GPU.

    Parameters
    ----------
    sourceImage : 2-D array-like
        Image to convolve.  Converted to contiguous float32 internally.
    fil : 2-D array-like
        Convolution kernel.  The compiled kernel assumes a 3x3 filter
        (KERNEL_RADIUS = 1 is hard-coded in the CUDA source).

    Returns
    -------
    numpy.ndarray
        float32 array of the same shape as ``sourceImage``.
    """
    # Coerce both inputs to contiguous float32 — the CUDA kernel reads raw
    # float buffers, so any other dtype/layout would be misinterpreted.
    # (The original converted only `fil`.)
    sourceImage = numpy.ascontiguousarray(sourceImage, dtype=numpy.float32)
    fil = numpy.ascontiguousarray(fil, dtype=numpy.float32)
    (imageHeight, imageWidth) = sourceImage.shape
    destImage = sourceImage.copy()
    DATA_W = numpy.int32(imageWidth)
    DATA_H = numpy.int32(imageHeight)
    # Prepare device arrays and upload the inputs.
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    print('start')
    # One thread per pixel: each block covers one image row, the grid's y
    # dimension covers the rows.  (The original hard-coded block=(5,1,1),
    # grid=(1,5), which only worked for 5x5 images; this launch is identical
    # for 5x5 and correct for any image up to 1024 columns wide.)
    convolutionGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H,
                   block=(imageWidth, 1, 1), grid=(1, imageHeight))
    # Pull the result back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
def test_convolution_cuda():
    """Smoke-test the GPU convolution on a small 5x5 binary image."""
    # Fixed test image; a random one can be substituted for larger runs:
    # original = numpy.random.rand(9, 9)
    original = numpy.float32(numpy.array([
        [1, 1, 1, 0, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 0],
        [0, 1, 1, 0, 0],
    ]))
    print(original)
    # You probably want to display the image using the tool of your choice here.
    # 3x3 convolution filter (diagonal cross plus centre).
    fil = numpy.array([[1, 0, 1],
                       [0, 1, 0],
                       [1, 0, 1]])
    destImage = original.copy()
    destImage[:] = numpy.nan
    destImage = convolution_cuda(original, fil)
    # You probably want to display the result image using the tool of your choice here.
    print('Done running the convolution kernel!')
    print(destImage)
# Run the self-test when executed as a script (no effect when imported).
if __name__ == '__main__':
    test_convolution_cuda()
本程序的算法采用的是离散卷积直接求和实现的,没有用卷积定理。(下一步准备用卷积定理试一试,应该会更快。)说说本程序值得注意的细节:卷积核矩阵存入到 shared memory 里面,因为核矩阵数据需要多次重复使用,所以非常适合放到共享内存。本人在分配线程和线程块的时候出了大问题:刚开始分配了过多的线程块、过少的线程,导致程序的运行速度低于 CPU 下库函数卷积的速度;后来改用每个块下的最大线程数 1024 个,这才加速成功。本次程序在 GPU 下可以提速 8 倍。