cuda编程python接口_CUDA共享内存问题(以及将CUDA与python / ctypes一...

(请注意,此答案中的代码还提供了有关如何在与使用python ctypes的python应用程序共享的库中使用CUDA代码(例如CUDA设备内核)的完整秘诀/示例.如果您希望使用CUDA库功能,答案here提供了一个使用python ctypes的示例.)

这里的问题是内核正在写越界,并且显然编译器/运行时将分配定位在设备内存中足够近的位置,这超出了第一个分配的界限,导致代码写入了第二个分配:

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map

cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

越界访问即将到来是因为内核启动涉及的线程数量过多(在这种情况下,它正在启动1024个线程),而我们实际上仅“需要” SIZE_X * SIZE_Y线程(在此示例中为16):

#define blockSize 1024

...

int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

...

update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

当然,这在CUDA编程中是“典型的”,以启动足够多的线程,但是在执行此操作时,在内核中包括“线程检查”,以防止任何“多余的”线程使任何非法的,过时的线程变得很重要.边界访问.在这种情况下,可能的内核线程检查可能是这样的:

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

这是一个基于提供的代码的完整示例(尽管在Linux上,并且在python代码中删除了Blender依赖项),显示了之前和之后的效果.请注意,我们甚至可以使用cuda-memcheck运行这样的代码,在这种情况下,该代码将指出越界访问(为清晰起见,在下面的第一个示例中省略了该访问):

$cat t383.cu

extern "C"

void init(float *t_height_map,

float *w_height_map,

float *s_height_map,

int SIZE_X,

int SIZE_Y);

extern "C"

void run_hydro_erosion(int cycles,

float t_step,

float min_tilt_angle,

float SEDIMENT_CAP,

float DISSOLVE_CONST,

float DEPOSIT_CONST,

int SIZE_X,

int SIZE_Y,

float PIPE_LENGTH,

float ADJACENT_LENGTH,

float TIME_STEP,

float MIN_TILT_ANGLE);

extern "C"

void free_mem();

extern "C"

void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system

#include

#include

#include

#include

#include

#include

#include

#include

// includes CUDA

#include

using namespace std;

#define FLOW_RIGHT 0

#define FLOW_UP 1

#define FLOW_LEFT 2

#define FLOW_DOWN 3

#define X_VEL 0

#define Y_VEL 1

#define LEFT_CELL row, col - 1

#define RIGHT_CELL row, col + 1

#define ABOVE_CELL row - 1, col

#define BELOW_CELL row + 1, col

// CUDA API error checking macro

#define T 1024

#define M 1536

#define blockSize 1024

#define cudaCheck(error) \n if (error != cudaSuccess) { \n printf("Fatal error: %s at %s:%d

", \n cudaGetErrorString(error), \n __FILE__, __LINE__); \n exit(1); \n }

__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)

{

int index = blockIdx.x * blockDim.x + threadIdx.x;

int col = index % SIZE_X;

int row = index / SIZE_X;

index = row * (SIZE_X * 4) + col * 4; // 3D index

#ifdef FIX

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

#endif

d_updated_water_flow_map[index + FLOW_RIGHT] = 0;

d_updated_water_flow_map[index + FLOW_UP] = 0;

d_updated_water_flow_map[index + FLOW_LEFT] = 0;

d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;

static float *water_height_map;

static float *sediment_height_map;

void init(float *t_height_map,

float *w_height_map,

float *s_height_map,

int SIZE_X,

int SIZE_Y)

{

/* set vars HOST*/

terrain_height_map = t_height_map;

water_height_map = w_height_map;

sediment_height_map = s_height_map;

}

void run_hydro_erosion(int cycles,

float t_step,

float min_tilt_angle,

float SEDIMENT_CAP,

float DISSOLVE_CONST,

float DEPOSIT_CONST,

int SIZE_X,

int SIZE_Y,

float PIPE_LENGTH,

float ADJACENT_LENGTH,

float TIME_STEP,

float MIN_TILT_ANGLE)

{

int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

int SIZE = SIZE_X * SIZE_Y * sizeof(float);

float *d_terrain_height_map, *d_updated_terrain_height_map;

float *d_water_height_map, *d_updated_water_height_map;

float *d_sediment_height_map, *d_updated_sediment_height_map;

float *d_suspended_sediment_level;

float *d_updated_suspended_sediment_level;

float *d_water_flow_map;

float *d_updated_water_flow_map;

float *d_prev_water_height_map;

float *d_water_velocity_vec;

float *d_rain_map;

cudaCheck(cudaMalloc(&d_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map

cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));

cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));

cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));

cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));

cudaCheck(cudaMalloc(&d_rain_map, SIZE));

cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));

cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));

cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

cout << "init terrain_height_map" << endl;

for (int i = 0; i < SIZE_X * SIZE_Y; i++) {

cout << terrain_height_map[i] << ", ";

if (i % SIZE_X == 0 && i != 0) cout << endl;

}

/* launch the kernel on the GPU */

float *temp;

while (cycles--) {

update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

temp = d_water_flow_map;

d_water_flow_map = d_updated_water_flow_map;

d_updated_water_flow_map = temp;

}

cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));

cout << "updated terrain" << endl;

for (int i = 0; i < SIZE_X * SIZE_Y; i++) {

cout << terrain_height_map[i] << ", ";

if (i % SIZE_X == 0 && i != 0) cout << endl;

}

}

$cat t383.py

import numpy

import ctypes

import random

width = 4

height = 4

size_x = width

size_y = height

N = size_x * size_y

scrpt_cycles = 1

kernel_cycles = 1

time_step = 0.005

pipe_length = 1.0

adjacent_length = 1.0

min_tilt_angle = 10

sediment_cap = 0.01

dissolve_const = 0.01

deposit_const = 0.01

# initialize arrays

ter_height_map = numpy.ones((N), dtype=numpy.float32)

water_height_map = numpy.zeros((N), dtype=numpy.float32)

sed_height_map = numpy.zeros((N), dtype=numpy.float32)

rain_map = numpy.ones((N), dtype=numpy.float32)

# load terrain height from image

for i in range(0, len(ter_height_map)):

ter_height_map[i] = 1

# import DLL

E = ctypes.cdll.LoadLibrary("./t383.so")

# initialize device memory

E.init( ctypes.c_void_p(ter_height_map.ctypes.data),

ctypes.c_void_p(water_height_map.ctypes.data),

ctypes.c_void_p(sed_height_map.ctypes.data),

ctypes.c_int(size_x),

ctypes.c_int(size_y))

# run erosion

while(scrpt_cycles):

scrpt_cycles = scrpt_cycles - 1

E.run_hydro_erosion(ctypes.c_int(kernel_cycles),

ctypes.c_float(time_step),

ctypes.c_float(min_tilt_angle),

ctypes.c_float(sediment_cap),

ctypes.c_float(dissolve_const),

ctypes.c_float(deposit_const),

ctypes.c_int(size_x),

ctypes.c_int(size_y),

ctypes.c_float(pipe_length),

ctypes.c_float(adjacent_length),

ctypes.c_float(time_step),

ctypes.c_float(min_tilt_angle))

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$python t383.py

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

0, 0, 0, 0, 0,

0, 0, 0, 0,

0, 0, 0, 0,

0, 0, 0,

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1,

========= ERROR SUMMARY: 0 errors

$

如果我们编译前一个没有修复的示例,但是使用cuda-memcheck运行它,我们将获得指示越界访问的输出:

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (31,0,0) in block (0,0,0)

========= Address 0x1050d6009f0 is out of bounds

========= Saved host backtrace up to driver entry point at kernel launch time

========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]

========= Host Frame:./t383.so [0x1c291]

========= Host Frame:./t383.so [0x39e33]

========= Host Frame:./t383.so [0x6879]

========= Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]

========= Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]

========= Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]

========= Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]

========= Host Frame:python [0x167d14]

========= Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]

========= Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]

========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]

========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]

========= Host Frame:python [0x177c2e]

=========

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (30,0,0) in block (0,0,0)

========= Address 0x1050d6009e0 is out of bounds

========= Saved host backtrace up to driver entry point at kernel launch time

========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]

========= Host Frame:./t383.so [0x1c291]

========= Host Frame:./t383.so [0x39e33]

========= Host Frame:./t383.so [0x6879]

========= Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]

========= Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]

========= Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]

========= Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]

========= Host Frame:python [0x167d14]

========= Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]

========= Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]

========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]

========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]

========= Host Frame:python [0x177c2e]

=========

... (output truncated for brevity of presentation)

========= ERROR SUMMARY: 18 errors

$

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值