python调用cuda核函数问题记录

最新推荐文章于 2023-07-11 02:54:50 发布

元气少女缘结神

最新推荐文章于 2023-07-11 02:54:50 发布

阅读量2.3k

点赞数 2

分类专栏： Improve Performance 基础语言

本文链接：https://blog.csdn.net/wd1603926823/article/details/108482225

版权

Improve Performance 同时被 2 个专栏收录

39 篇文章 9 订阅

订阅专栏

基础语言

28 篇文章 0 订阅

订阅专栏

配置教程网上一大堆，此处不再赘述。程序是从网上下载的，代码如下：

import pycuda.compiler as nvcc
import pycuda.gpuarray as gpu
import pycuda.driver as cu
import pycuda.autoinit
from sys import argv
from ws_utils import *

# Read and compile CUDA kernels.
# print() with a single string argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Compiling CUDA kernels...")
# Read the kernel source through a context manager so the file is closed
# before compilation starts.
with open("kernels.cu") as kernel_file:
  kernel_source = kernel_file.read()
main_module = nvcc.SourceModule(kernel_source)

# Look up the kernel entry points and the texture reference used by
# watershed(); the names must match the symbols declared in kernels.cu.
descent_kernel = main_module.get_function("descent_kernel")
image_texture = main_module.get_texref("img")
plateau_kernel = main_module.get_function("plateau_kernel")
minima_kernel = main_module.get_function("minima_kernel")
flood_kernel = main_module.get_function("flood_kernel")
increment_kernel = main_module.get_function("increment_kernel")

# Helper shared by steps 2-4 of watershed().
def _run_until_stable(kernel, labeled_d, width, height, block, grid):
  """Launch `kernel` repeatedly until its device counter stops changing.

  A fresh one-element int32 counter is uploaded, then the kernel is launched
  with (labeled_d, counter, width, height). After every launch the counter is
  read back; the loop ends as soon as two consecutive launches report the
  same value.
  """
  counters_d = gpu.to_gpu(np.int32([0]))
  # Two distinct sentinels guarantee that at least two launches happen
  # before the values can compare equal.
  old, new = -1, -2
  while old != new:
    old = new
    kernel(labeled_d, counters_d, width, height, block=block, grid=grid)
    new = counters_d.get()[0]


# PyCUDA wrapper for watershed.
def watershed(I):
  """Run the GPU watershed kernels on a 2-D image and return the label map.

  Args:
    I: 2-D array whose `shape` unpacks to (height, width). It is copied and
      converted to float32 before being bound to the `img` texture.

  Returns:
    A float32 NumPy array of shape (height, width), read back from the
    device label buffer after all kernel stages have run.
  """

  # Get contiguous image + shape.
  height, width = I.shape
  I = np.array(I, dtype=np.float32, order='C')

  # Get block/grid size for steps 1-3.
  # Grid dimensions must be integers: with true division (`/`) Python 3
  # produces floats and the kernel launch fails with "No registered converter
  # was able to produce a C++ rvalue of type unsigned int from this Python
  # object of type float". Floor division keeps them ints on Python 2 and 3.
  # NOTE(review): the "- 2" presumably accounts for a one-pixel border per
  # block inside kernels.cu -- confirm against the kernel source.
  block_size = (6, 6, 1)
  grid_size = (width // (block_size[0] - 2),
               height // (block_size[1] - 2))

  # Get block/grid size for step 4.
  block_size2 = (16, 16, 1)
  grid_size2 = (width // (block_size2[0] - 2),
                height // (block_size2[1] - 2))

  # Initialize variables. The kernels receive the dimensions as 32-bit ints.
  labeled = np.zeros((height, width), dtype=np.float32)
  width = np.int32(width)
  height = np.int32(height)

  # Transfer labels asynchronously.
  labeled_d = gpu.to_gpu_async(labeled)

  # Bind CUDA textures.
  I_cu = cu.matrix_to_array(I, order='C')
  cu.bind_array_to_texref(I_cu, image_texture)

  # Step 1.
  descent_kernel(labeled_d, width, height,
                 block=block_size, grid=grid_size)

  # The timed region starts after step 1, exactly as in the original code.
  start_time = cu.Event()
  end_time = cu.Event()
  start_time.record()

  # Step 2.
  increment_kernel(labeled_d, width, height,
                   block=block_size2, grid=grid_size2)
  _run_until_stable(minima_kernel, labeled_d, width, height,
                    block_size, grid_size)

  # Step 3.
  _run_until_stable(plateau_kernel, labeled_d, width, height,
                    block_size, grid_size)

  # Step 4
  _run_until_stable(flood_kernel, labeled_d, width, height,
                    block_size2, grid_size2)

  result = labeled_d.get()

  # End GPU timers. time_till() reports milliseconds, hence the 1e-3 factor
  # to get seconds. The value is only kept for ad-hoc profiling.
  end_time.record()
  end_time.synchronize()
  gpu_time = start_time.time_till(end_time) * 1e-3

  return result

if __name__ == '__main__':
  # Show the usage information and stop: without exactly one argument
  # there is no input file, and argv[1] below would raise IndexError.
  if len(argv) != 2:
    print("Usage: python ws_gpu.py test.dcm")
    raise SystemExit(1)
  # Read in the DICOM image data.
  O = read_dcm(argv[1])
  # Preprocess the image.
  I = preprocess(O)
  # Get the watershed transform.
  L = watershed(I)
  # Show the final edges.
  showEdges(L, O)

目前主要两个问题：

1、调用核函数时nvcc编译utf8格式问题：

main_module = nvcc.SourceModule(kernel_source)
stderr=stderr.decode("utf-8", "replace"))
pycuda.driver.CompileError: nvcc compilation of ....cu

解决办法：在VS安装路径下找到cl.exe所在的目录（类似 /VC/tool/bin 这样的路径，比如我的是 E:\VS2015\VS\VC\bin），把该目录添加到环境变量PATH中，然后将下面这段代码添加在程序开始处。

import os

# Directory that holds the MSVC compiler (cl.exe) on this machine; adjust it
# to the local Visual Studio installation.
msvc_bin_dir = r"E:\VS2015\VS\VC\bin"

# A non-zero status from os.system is treated as "cl.exe could not be run",
# in which case the MSVC directory is appended to PATH for this process.
if os.system("cl.exe") != 0:
    os.environ['PATH'] = os.environ['PATH'] + ';' + msvc_bin_dir

# Check again so a wrong directory fails early with a clear message.
if os.system("cl.exe") != 0:
    raise RuntimeError("cl.exe still not found, path probably incorrect")

就解决了这个报错。

2、在调用cu文件的具体函数时报错：

descent_kernel(labeled_d, width,height, block=block_size, grid=grid_size)

  File "E:\anaconda\anaconda3.5.1\lib\site-packages\pycuda\driver.py", line 456, in function_call
    func._launch_kernel(grid, block, arg_buf, shared, None)

TypeError: No registered converter was able to produce a C++ rvalue of type unsigned int from this Python object of type float

网上很少有人解决这个问题，我只找到了一个日本人的解决办法：https://eigo.rumisunheart.com/2018/10/01/how-to-create-laplacian-filter-by-using-pycuda/ （需要fan qiang才能打开）。

这个人本来是这样写的：

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
import scipy.misc as scm
import matplotlib.pyplot as p

# Alternative input for quick experiments (random 20x20 matrix):
#   realrow = np.random.random([20,20]).astype(np.float32)

# Load the test image and convert it to float32 for the texture upload.
# NOTE(review): scipy.misc.imread is not available in recent SciPy
# releases -- confirm the installed version provides it.
realrow = scm.imread('lenaG.jpg').astype(np.float32)

# M is the number of rows, N the number of columns.
M, N = realrow.shape
print(realrow.shape)

# Compile the CUDA module holding the 2-D texture reference "tex" and the
# filter kernel that reads the image through it.
mod_copy_texture = SourceModule(
"""
texture<float, 2> tex;

// Writes, for every pixel (i, j), 8 * centre minus the sum of its eight
// neighbours (a 3x3 Laplacian-style filter) into data[i * N + j].
//   C    : two floats, C[0] = number of rows M, C[1] = number of columns N
//   data : output buffer of M * N floats, row-major
__global__ void copy_texture_kernel(float *C, float *data)
{
  const int M = C[0];
  const int N = C[1];  // was the undeclared identifier "C1"

  const int i0 = threadIdx.x + blockIdx.x * blockDim.x;
  const int j0 = threadIdx.y + blockIdx.y * blockDim.y;

  // Stride loops so a grid smaller than the image still covers every pixel.
  // j restarts from j0 for every row; the original never reset it, so rows
  // after a thread's first one were skipped. No barrier is needed: threads
  // share no data, and a __syncthreads() inside a loop whose trip count
  // differs between threads of a block is not safe.
  for (int i = i0; i < M; i += blockDim.x * gridDim.x)
  {
    for (int j = j0; j < N; j += blockDim.y * gridDim.y)
    {
      data[i*N+j] = 8*tex2D(tex,j,i)
                  - tex2D(tex,j-1,i) - tex2D(tex,j-1,i-1) - tex2D(tex,j-1,i+1)
                  - tex2D(tex,j+1,i) - tex2D(tex,j+1,i+1) - tex2D(tex,j+1,i-1)
                  - tex2D(tex,j,i+1) - tex2D(tex,j,i-1);
    }
  }
}
""")

# Look up the compiled kernel by name.
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")

# Bind the image to the module's 2-D texture reference "tex" so the kernel
# can read it with tex2D().
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(realrow , texref , order = "C")

#texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#texref.set_filter_mode()

# Host buffer that receives the filtered image (same shape/dtype as input).
gpu_output = np.zeros_like(realrow)
# NOTE: this is the original, failing launch, kept verbatim on purpose.
# On Python 3, M/32 and N/32 are floats, and the launch raises the
# "unsigned int from this Python object of type float" TypeError shown
# earlier; the corrected listing further down uses M//32 and N//32.
copy_texture_func(cuda.In(np.float32([M,N])),cuda.Out(gpu_output),block=(32,32, 1), grid=(M/32,N/32,1), texrefs=[texref])

# Show the input image (left) and the filtered result (right) in grayscale.
p.gray()
p.subplot(1,2,1)
p.imshow(realrow)
p.subplot(1,2,2)
p.imshow(gpu_output)
p.show()

运行后报了跟我一样的错，于是他把代码修改为：

# Look up the compiled kernel by name.
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")

# Bind the image to the module's 2-D texture reference "tex" so the kernel
# can read it with tex2D().
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(realrow, texref, order="C")

#texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#texref.set_filter_mode()

# Host buffer that receives the filtered image (same shape/dtype as input).
gpu_output = np.zeros_like(realrow)
# Grid dimensions must be integers, so use floor division: on Python 3,
# M/32 is a float and the launch is rejected with a TypeError.
copy_texture_func(cuda.In(np.float32([M, N])), cuda.Out(gpu_output),
                  block=(32, 32, 1), grid=(M//32, N//32, 1), texrefs=[texref])

# matplotlib.pyplot is imported as `p` in this script; the original used the
# undefined name `plt` for the font size, which raises NameError.
p.rcParams['figure.figsize'] = 30, 30
p.rcParams["font.size"] = "18"

# Show the input image (left) and the filtered result (right) in grayscale.
p.gray()
p.subplot(1, 2, 1)
p.imshow(realrow)
p.subplot(1, 2, 2)
p.imshow(gpu_output)
p.show()

改成这样就OK了。其实就是把 grid=(M/32,N/32,1) 改成 grid=(M//32,N//32,1)：在Python 3中，`/` 是真除法，结果是float，而核函数的grid各维必须是整数，所以要改用整除 `//`。我也在自己的程序里试了一下，果然是这个问题。