Numba 的 CUDA 编程部分与 C++ 版的 CUDA 基本一致,
可参考 C++ 版的笔记:
《CUDA编程基本入门学习笔记》
看懂上面链接中的内容之后,就很容易理解 Numba 的 Python 代码了。
下面直接给出代码:
from numba import cuda ,vectorize
import numpy as np
import math
from timeit import default_timer as timer
def func_cpu(a, b, c, th):
    """Threshold the absolute difference of two 2-D arrays on the CPU.

    For every position ``(y, x)`` within ``a.shape``, ``c[y, x]`` is set to
    255 when ``|a[y, x] - b[y, x]|`` is strictly greater than ``th`` and to 0
    otherwise.  The result is written into ``c`` in place.

    Args:
        a: 2-D array; its ``shape`` defines the iteration range.
        b: 2-D array indexed over the same range as ``a``.
        c: 2-D output array indexed over the same range as ``a``.
        th: Threshold compared against the absolute difference.

    Returns:
        None.
    """
    rows = a.shape[0]
    cols = a.shape[1]
    for y in range(rows):
        for x in range(cols):
            # Always subtract the smaller value from the larger one so the
            # difference is never negative (and cannot wrap around when the
            # element type is unsigned).
            if a[y, x] > b[y, x]:
                diff = a[y, x] - b[y, x]
            else:
                diff = b[y, x] - a[y, x]
            c[y, x] = 255 if diff > th else 0
@cuda.jit
def func_gpu(a, b, c):
    """CUDA kernel: store the squared difference ``(a[i] - b[i]) ** 2`` in ``c[i]``.

    The arrays are addressed with a single index running over ``a.shape[0]``.
    Each thread starts at its global x index and advances by the total number
    of threads in the grid, so any launch configuration covers every element.
    """
    length = a.shape[0]
    # Global x index of this thread, and the total thread count along x.
    first = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.blockDim.x * cuda.gridDim.x
    for i in range(first, length, stride):
        delta = a[i] - b[i]
        c[i] = delta * delta
@cuda.jit
def func_gpu_2(a, b, c, d_th):
    """CUDA kernel: threshold the absolute difference of two 2-D arrays.

    For every position ``(y, x)`` within ``a.shape``, ``c[y, x]`` is set to
    255 when ``|a[y, x] - b[y, x]|`` is strictly greater than ``d_th[0]`` and
    to 0 otherwise.  The result is written into ``c`` in place.

    Args:
        a: 2-D array; its ``shape`` defines the iteration range.
        b: 2-D array indexed over the same range as ``a``.
        c: 2-D output array indexed over the same range as ``a``.
        d_th: Array whose first element is the threshold.
    """
    y_max = a.shape[0]
    x_max = a.shape[1]
    # Global index of this thread along each grid axis.
    startX = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    startY = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
    # Total number of threads along each axis; used as the loop stride so a
    # grid of any size still visits every element.
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y
    # The threshold does not change inside the loops, so read it once.
    threshold = d_th[0]
    for y in range(startY, y_max, gridY):
        for x in range(startX, x_max, gridX):
            # Subtract the smaller value from the larger one so the
            # difference is never negative (and cannot wrap around when the
            # element type is unsigned).
            if a[y, x] > b[y, x]:
                diff = a[y, x] - b[y, x]
            else:
                diff = b[y, x] - a[y, x]
            if diff > threshold:
                c[y, x] = 255
            else:
                c[y, x] = 0
@vectorize(["float32 (float32 , float32 )"], target='cuda')
def func_gpu_3(a, b):
    """Return ``a - b`` for a pair of float32 scalars.

    ``vectorize`` turns this scalar function into an element-wise function
    compiled for the CUDA target.
    """
    difference = a - b
    return difference
def main():
    """Benchmark the CPU loop, the explicit CUDA kernel and the CUDA ufunc.

    Each variant runs 10 times on 128x128 float32 inputs, and the wall-clock
    time of every run is printed in milliseconds.
    """
    size = 128
    repeats = 10

    # CPU baseline. The inputs are rebuilt on every pass, before the timer
    # starts, so only the call to func_cpu is measured.
    for _ in range(repeats):
        A = np.full((size, size), 3, dtype=np.float32)
        B = np.ones((size, size), dtype=np.float32)
        C = np.zeros((size, size), dtype=np.float32)
        t0 = timer()
        th = 2
        func_cpu(A, B, C, th)
        elapsed = timer() - t0
        print("CPU took %f ms " % (elapsed * 1000))
    print("------------------------------------")

    # Explicit kernel. The timed region covers the host-to-device copies,
    # the kernel launch and the copy of the result back to the host.
    for _ in range(repeats):
        t0 = timer()
        block_shape = (32, 8)
        grid_shape = (32, 16)
        threshold_host = np.array([2], dtype=np.uint8)
        dev_a = cuda.to_device(A)
        dev_b = cuda.to_device(B)
        dev_c = cuda.to_device(C)
        dev_threshold = cuda.to_device(threshold_host)
        func_gpu_2[grid_shape, block_shape](dev_a, dev_b, dev_c, dev_threshold)
        C = dev_c.copy_to_host()
        elapsed = timer() - t0
        print("GPU took %f ms" % (elapsed * 1000))
    print("------------------------------------")

    # Vectorized ufunc called directly on the host arrays.
    for _ in range(repeats):
        t0 = timer()
        C = func_gpu_3(A, B)
        elapsed = timer() - t0
        print("vectorize GPU took %f ms" % (elapsed * 1000))
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()