tx = cuda.threadIdx.x
bx = cuda.blockIdx.x
bw = cuda.blockDim.x
i = tx + bx * bw
array[i] = something(i)
i = cuda.grid(1)
array[i] = something(i)
stream = cuda.stream()
devary = cuda.to_device(an_array, stream=stream)
a_cuda_kernel[griddim, blockdim, stream](devary)
cuda.copy_to_host(an_array, stream=stream)
# 在an_array中的数据可能尚未就绪
stream.synchronize()
# an_array中的数据已经就绪
stream = cuda.stream()
with stream.auto_synchronize():
devary = cuda.to_device(an_array, stream=stream)
a_cuda_kernel[griddim, blockdim, stream](devary)
devary.copy_to_host(an_array, stream=stream)
# an_array中的数据已经就绪
bpg = 50
tpb = 32
n = bpg * tpb
@jit(argtypes=[float32[:,:], float32[:,:], float32[:,:]], target='gpu')
def cu_square_matrix_mul(A, B, C):
sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
bw = cuda.blockDim.x
bh = cuda.blockDim.y
x = tx + bx * bw
y = ty + by * bh
acc = 0.
for i in range(bpg):
if x < n and y < n:
sA[ty, tx] = A[y, tx + i * tpb]
sB[ty, tx] = B[ty + i * tpb, x]
cuda.syncthreads()
if x < n and y < n:
for j in range(tpb):
acc += sA[ty, j] * sB[j, tx]
cuda.syncthreads()
if x < n and y < n:
C[y, x] = acc