cuda编程python接口_Python GPU编程之入门篇

# Kernel-body fragment: derive this thread's flat index for a 1-D launch
# by hand from the block index, the block width and the thread's offset
# inside its block.
bw = cuda.blockDim.x   # threads per block along x
bx = cuda.blockIdx.x   # which block this thread belongs to
tx = cuda.threadIdx.x  # offset of the thread inside its block
i = bx * bw + tx
# Each thread writes exactly one element.
array[i] = something(i)

# Kernel-body fragment: shorthand for the manual index computation —
# cuda.grid(1) gives this thread's index for a one-dimensional launch
# (per the Numba CUDA documentation).
i = cuda.grid(1)

array[i] = something(i)

# Queue the upload, the kernel launch and the download on one CUDA stream,
# then wait for the stream explicitly.
stream = cuda.stream()
devary = cuda.to_device(an_array, stream=stream)
a_cuda_kernel[griddim, blockdim, stream](devary)
# The copy back must go through the device array handle: it is `devary`
# that holds the result, and this matches the auto_synchronize example.
devary.copy_to_host(an_array, stream=stream)
# The data in an_array may not be ready yet.
stream.synchronize()
# The data in an_array is ready now.

# Same sequence as the explicit-synchronize example, but the context
# manager waits for the stream when the `with` block exits, so no manual
# stream.synchronize() call is needed.
stream = cuda.stream()
with stream.auto_synchronize():
    devary = cuda.to_device(an_array, stream=stream)
    a_cuda_kernel[griddim, blockdim, stream](devary)
    devary.copy_to_host(an_array, stream=stream)
# The data in an_array is ready now.

# Sizing constants read by cu_square_matrix_mul:
#   tpb - edge length of the shared-memory tiles (presumably "threads per
#         block" along one axis; confirm against the launch site)
#   bpg - number of tiles the kernel steps through (presumably "blocks per
#         grid" along one axis; confirm against the launch site)
#   n   - matrix edge length the kernel bounds-checks against
bpg, tpb = 50, 32
n = tpb * bpg

# NOTE(review): `argtypes=` / `target='gpu'` is the legacy NumbaPro spelling;
# current Numba releases expose CUDA kernels through `@cuda.jit` instead.
# Confirm which library version this file targets before changing it.
@jit(argtypes=[float32[:,:], float32[:,:], float32[:,:]], target='gpu')
def cu_square_matrix_mul(A, B, C):
    """Tiled matrix multiply kernel: accumulate A x B into C.

    A, B and C are 2-D float32 arrays (see the decorator signature). Each
    thread produces one element ``C[y, x]``, staging ``tpb`` x ``tpb``
    tiles of A and B in shared memory and stepping through ``bpg`` tiles.

    Relies on the module-level ``tpb``, ``bpg`` and ``n``. Assumes a launch
    with ``(tpb, tpb)`` threads per block so that ``tx``/``ty`` index the
    tiles directly — TODO confirm against the launch site.
    """
    # Per-block scratch tiles for the current slice of A and of B.
    sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    # Global column (x) and row (y) of the output element this thread owns.
    x = tx + bx * bw
    y = ty + by * bh

    acc = 0.
    for i in range(bpg):
        # Each in-range thread loads one element of the i-th tile of A and B.
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        # Barrier sits outside the guard so every thread in the block reaches
        # it; the tiles must be fully loaded before anyone reads them.
        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        # Do not let any thread overwrite the tiles for the next iteration
        # while others are still reading the current ones.
        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值