环境配置的教程网上有很多,这里不再赘述。下面的程序是从网上下载的:
import pycuda.compiler as nvcc
import pycuda.gpuarray as gpu
import pycuda.driver as cu
import pycuda.autoinit
from sys import argv
from ws_utils import *
# Read and compile CUDA kernels, then look up every entry point used below.
# print() with a single string argument behaves the same on Python 2 and 3.
print("Compiling CUDA kernels...")
# Use a context manager so the source file handle is closed right away.
with open("kernels.cu") as kernel_file:
    kernel_source = kernel_file.read()
main_module = nvcc.SourceModule(kernel_source)
# Texture reference named "img" in kernels.cu; watershed() binds the image to it.
image_texture = main_module.get_texref("img")
# Kernel handles, listed in the order watershed() launches them.
descent_kernel = main_module.get_function("descent_kernel")
increment_kernel = main_module.get_function("increment_kernel")
minima_kernel = main_module.get_function("minima_kernel")
plateau_kernel = main_module.get_function("plateau_kernel")
flood_kernel = main_module.get_function("flood_kernel")
# PyCUDA wrapper for watershed.
def _run_until_stable(kernel, labeled_d, width, height, block, grid):
    """Relaunch ``kernel`` until its device-side counter stops changing.

    A fresh one-element int32 counter is uploaded, then the kernel is
    launched repeatedly; after each launch the counter is read back and the
    loop ends once two consecutive reads return the same value.
    """
    counters_d = gpu.to_gpu(np.int32([0]))
    # Two distinct sentinels guarantee at least one launch.
    previous, current = -1, -2
    while previous != current:
        previous = current
        kernel(labeled_d, counters_d, width, height, block=block, grid=grid)
        current = counters_d.get()[0]


def watershed(I):
    """Run the GPU watershed pipeline on a 2-D image and return the labels.

    Args:
        I: 2-D array-like image (its ``shape`` must unpack into
            ``height, width``). It is copied and converted to float32, so
            the caller's array is not modified.

    Returns:
        A float32 NumPy array of shape ``(height, width)`` read back from
        the device label buffer after all four steps have run.
    """
    # Get contiguous image + shape.
    height, width = I.shape
    I = np.float32(I.copy())

    # Block/grid sizes. Floor division is required: on Python 3 the `/`
    # operator yields floats, and PyCUDA rejects non-integer grid dimensions
    # ("No registered converter ... unsigned int from ... float").
    # NOTE(review): the "- 2" presumably leaves a one-pixel halo on each side
    # of a block -- confirm against kernels.cu.
    block_size = (6, 6, 1)        # steps 1-3
    tile = block_size[0] - 2
    grid_size = (width // tile, height // tile)

    block_size2 = (16, 16, 1)     # steps 2 (increment) and 4
    tile2 = block_size2[0] - 2
    grid_size2 = (width // tile2, height // tile2)

    # Initialize the label buffer and the scalar kernel arguments.
    labeled = np.zeros((height, width), dtype=np.float32)
    width = np.int32(width)
    height = np.int32(height)

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)

    # Bind the image to the CUDA texture the kernels sample from.
    I_cu = cu.matrix_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d, width, height,
                   block=block_size, grid=grid_size)

    # GPU timers cover steps 2-4 only, as in the original script.
    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    # Step 2.
    increment_kernel(labeled_d, width, height,
                     block=block_size2, grid=grid_size2)
    _run_until_stable(minima_kernel, labeled_d, width, height,
                      block_size, grid_size)

    # Step 3.
    _run_until_stable(plateau_kernel, labeled_d, width, height,
                      block_size, grid_size)

    # Step 4.
    _run_until_stable(flood_kernel, labeled_d, width, height,
                      block_size2, grid_size2)

    result = labeled_d.get()

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    # Elapsed time for steps 2-4, scaled by 1e-3; computed but not reported.
    gpu_time = start_time.time_till(end_time) * 1e-3

    return result
if __name__ == '__main__':
    # Show the usage information and stop: without exactly one argument
    # there is no input file, and argv[1] below would raise IndexError.
    if len(argv) != 2:
        print("Usage: python ws_gpu.py test.dcm")
        raise SystemExit(1)
    # Read in the DICOM image data.
    O = read_dcm(argv[1])
    # Preprocess the image.
    I = preprocess(O)
    # Get the watershed transform.
    L = watershed(I)
    # Show the final edges.
    showEdges(L, O)
目前主要两个问题:
1、编译核函数(调用 SourceModule)时,nvcc 编译报 utf-8 解码相关的错误:
main_module = nvcc.SourceModule(kernel_source)
stderr=stderr.decode("utf-8", "replace"))
pycuda.driver.CompileError: nvcc compilation of ....cu
解决方法:在 VS 的安装路径下找到包含 cl.exe 的目录(类似 VC\bin 这样的路径,比如我的是 E:\VS2015\VS\VC\bin),把它添加到环境变量 PATH 中,然后把下面这段代码加到程序的开头。
import os

# Probe for the MSVC compiler; a non-zero exit status from the shell is
# taken to mean cl.exe could not be run.
cl_missing = os.system("cl.exe") != 0
if cl_missing:
    # Append the local Visual Studio compiler directory to PATH.
    os.environ['PATH'] = os.environ['PATH'] + ';' + r"E:\VS2015\VS\VC\bin"

# Probe again (unconditionally, as the original snippet does) and give up
# if the compiler still cannot be run.
cl_still_missing = os.system("cl.exe") != 0
if cl_still_missing:
    raise RuntimeError("cl.exe still not found, path probably incorrect")
就解决了这个报错。
2、调用 .cu 文件里的核函数时报错:
descent_kernel(labeled_d, width,height, block=block_size, grid=grid_size)
File "E:\anaconda\anaconda3.5.1\lib\site-packages\pycuda\driver.py", line 456, in function_call
func._launch_kernel(grid, block, arg_buf, shared, None)
TypeError: No registered converter was able to produce a C++ rvalue of type unsigned int from this Python object of type float
网上很少有人给出解决办法,我只找到了一位日本博主的解决办法:https://eigo.rumisunheart.com/2018/10/01/how-to-create-laplacian-filter-by-using-pycuda/ (需要翻墙才能打开)。
这位博主原来的代码是这样写的:
# Demo script: filter a grayscale image on the GPU through a 2-D texture and
# show the input and the output side by side.
#
# NOTE: this is the ORIGINAL listing discussed in the article. The kernel
# launch at the bottom still computes the grid with `/`; on Python 3 that is
# true division, so the grid dimensions are floats and PyCUDA raises the
# "unsigned int from ... float" TypeError. It is kept that way on purpose.
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
import scipy.misc as scm
import matplotlib.pyplot as p
#realrow = np.random.random([20,20]).astype(np.float32)
#print realrow.shape
realrow = scm.imread('lenaG.jpg').astype(np.float32)
# The image must be 2-D: its shape is unpacked into (rows, columns).
(M,N)=realrow.shape
print (realrow.shape)
# Kernel source. C carries the image size as two floats (C[0] = M, C[1] = N);
# each output element is 8 * centre minus its eight texture neighbours.
# The listing had "int N=C1;", which does not compile (C1 is undeclared);
# it is restored to C[1] to match C[0] on the line before it.
mod_copy_texture=SourceModule(
"""
texture<float,2>tex;
__global__ void copy_texture_kernel(float *C,float * data)
{
int i = threadIdx.x+(blockIdx.x*(blockDim.x));
int j = threadIdx.y+(blockIdx.y*(blockDim.y));
int gx=0;
int gy=0;
int M=C[0];
int N=C[1];
while(i<M)
{
while(j<N)
{
data[i*N+j] = 8*tex2D(tex,j,i)-tex2D(tex,j-1,i)-tex2D(tex,j-1,i-1)-tex2D(tex,j-1,i+1)-tex2D(tex,j+1,i)-tex2D(tex,j+1,i+1)-tex2D(tex,j+1,i-1)-tex2D(tex,j,i+1)-tex2D(tex,j,i-1);
__syncthreads();
j += blockDim.y * gridDim.y;
}
i += blockDim.x * gridDim.x;
}
}
""")
########
#get the kernel
########
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
#########
#Map the Kernel to texture object
#########
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(realrow , texref , order = "C")
#texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#texref.set_filter_mode()
# Host buffer that cuda.Out() fills from the kernel's `data` argument.
gpu_output = np.zeros_like(realrow)
# Failing line (see NOTE at the top): M/32 and N/32 are floats on Python 3.
copy_texture_func(cuda.In(np.float32([M,N])),cuda.Out(gpu_output),block=(32,32, 1), grid=(M/32,N/32,1), texrefs=[texref])
# Show the input (left) and the filtered output (right) in grayscale.
p.gray()
p.subplot(1,2,1)
p.imshow(realrow)
p.subplot(1,2,2)
p.imshow(gpu_output)
p.show()
运行后报了跟我一样的错误,于是他把后半部分(获取核函数并调用的部分)改成了下面这样:
# Corrected second half of the demo: it relies on mod_copy_texture, cuda, np,
# p, realrow, M and N defined by the first half of the script.
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
#########
#Map the Kernel to texture object
#########
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(realrow , texref , order = "C")
#texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#texref.set_filter_mode()
# Host buffer that cuda.Out() fills from the kernel's `data` argument.
gpu_output = np.zeros_like(realrow)
# Grid dimensions must be integers: `//` floors, whereas `/` yields floats
# on Python 3 and makes the launch fail.
copy_texture_func(cuda.In(np.float32([M,N])),cuda.Out(gpu_output),
                  block=(32,32, 1), grid=(M//32,N//32,1), texrefs=[texref])
p.rcParams['figure.figsize'] = 30, 30
# pyplot is imported as `p`; the listing's `plt` is an undefined name.
p.rcParams["font.size"] = "18"
# Show the input (left) and the filtered output (right) in grayscale.
p.gray()
p.subplot(1,2,1)
p.imshow(realrow)
p.subplot(1,2,2)
p.imshow(gpu_output)
p.show()
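改成这样就 OK 了。其实就是把 grid=(M/32,N/32,1) 改成 grid=(M//32,N//32,1):在 Python 3 中 `/` 是真除法,结果是 float,而 PyCUDA 要求 block 和 grid 的每一维都是整数,所以必须用整除 `//`。我也照着把自己程序里计算 grid_size 的 `/` 改成了 `//`,果然是这个问题。
已经跑出结果了。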