Numba getting-started examples

1-D vector addition: C = A + B

On an Ubuntu machine with a recent-generation NVIDIA GPU:

Environment setup:

conda create -n numba_cuda_python3.10 python=3.10
conda activate numba_cuda_python3.10

conda install numba
conda install cudatoolkit
conda install -c nvidia cuda-python
    or: conda install nvidia::cuda-python
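
Before running the examples, it is worth confirming that Numba can actually see the GPU. A quick sanity check using Numba's standard detection helpers (this snippet is not part of the original post):

from numba import cuda

print('CUDA available:', cuda.is_available())  # True if a supported GPU and driver are found
cuda.detect()  # prints the detected device(s)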

Example 1: source code

C[i] = A[i] + B[i]

hello_numba_cpu_01.py

import time
import numpy as np
from numba import jit, njit

def f_py(a, b, c, N):
    # pure-Python baseline: one scalar add per loop iteration
    for i in range(N):
        c[i] = a[i] + b[i]

@jit
def f_bin(a, b, c, N):
    # compiled by Numba's JIT on first call
    for i in range(N):
        c[i] = a[i] + b[i]

@njit
def f_pure_bin(a, b, c, N):
    # nopython mode: compilation fails instead of falling back to object mode
    for i in range(N):
        c[i] = a[i] + b[i]


if __name__ == "__main__":
    np.random.seed(1234)
    N = 1024*1024*128  # 128M elements (1 GiB per float64 array)
    a_h = np.random.random(N)
    b_h = np.random.random(N)
    c_h1 = np.random.random(N)
    c_h2 = np.random.random(N)
    c_h3 = np.random.random(N)
    # first calls compile f_bin and f_pure_bin, keeping compile time out of the timings below
    f_bin(a_h, b_h, c_h1, N)
    print('a_h  =', a_h)
    print('b_h  =', b_h)
    print('c_h1 =', c_h1)
    f_pure_bin(a_h, b_h, c_h2, N)
    print('c_h2 =', c_h2)

    s1 = time.time()
    f_py(a_h, b_h, c_h1, N)
    e1 = time.time()
    print('time   py:',e1 - s1)

    s1 = time.time()
    f_bin(a_h, b_h, c_h2, N)
    e1 = time.time()
    print('time  jit:',e1 - s1)

    s1 = time.time()
    f_pure_bin(a_h, b_h, c_h3, N)
    e1 = time.time()
    print('time njit:',e1 - s1)

    print('c_h1 =', c_h1)
    print('c_h2 =', c_h2)
    print('c_h3 =', c_h3)

Measured runtimes: the pure-Python version takes about 26 s, while the jit version takes about 0.23 s, roughly a 100x speedup. Note that f_bin and f_pure_bin are each called once before the timed runs, so JIT compilation time is excluded from the measurements.
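
For reference, the same loop can also be parallelized across CPU cores with numba.prange. This variant is not in the original post; it is a minimal sketch, with the function name f_par chosen here for illustration:

from numba import njit, prange

@njit(parallel=True)
def f_par(a, b, c, N):
    # Numba distributes the prange iterations across CPU threads
    for i in prange(N):
        c[i] = a[i] + b[i]

It is called exactly like f_bin above.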


Example 2: source code

C[i] = A[i] + B[i]

hello_numba_gpu_02.py

import time
import numpy as np
from numba import jit, njit, cuda

def f_py(a, b, c, N):
    for i in range(N):
        c[i] = a[i] + b[i]

@jit
def f_bin(a, b, c, N):
    for i in range(N):
        c[i] = a[i] + b[i]

@njit
def f_pure_bin(a, b, c, N):
    for i in range(N):
        c[i] = a[i] + b[i]

@cuda.jit
def f_gpu(a, b, c):
    # absolute thread index: threadIdx.x + (blockIdx.x * blockDim.x)
    tid = cuda.grid(1)
    size = len(c)

    # bounds check: the grid may contain more threads than array elements
    if tid < size:
        c[tid] = a[tid] + b[tid]


if __name__ == "__main__":
    np.random.seed(1234)
    N = 1024*1024*128  # 128M elements; three float64 arrays of this size need ~3 GiB of GPU memory
    a_d = cuda.to_device(np.random.random(N))
    b_d = cuda.to_device(np.random.random(N))
    c_d = cuda.device_array_like(a_d)
    print('a_d =', a_d.copy_to_host())
    print('b_d =', b_d.copy_to_host())
    print('c_d =', c_d.copy_to_host())

    a_h = a_d.copy_to_host()
    b_h = b_d.copy_to_host()
    c_h = c_d.copy_to_host()
    # first calls compile f_bin and f_pure_bin, keeping compile time out of the timings below
    f_bin(a_h, b_h, c_h, N)
    print('a_h =', a_h)
    print('b_h =', b_h)
    print('c_h =', c_h)
    # reset c_h before the njit warm-up run
    c_h = np.random.random(N)
    f_pure_bin(a_h, b_h, c_h, N)
    print('c_h =', c_h)

    # forall picks a launch configuration automatically; this first call also compiles the kernel
    f_gpu.forall(len(a_d))(a_d, b_d, c_d)
    print('c_d =', c_d.copy_to_host())

    # Enough threads per block for several warps per block
    nthreads = 256
    # Enough blocks to cover the entire vector depending on its length
    nblocks = (len(a_d) // nthreads) + 1
    f_gpu[nblocks, nthreads](a_d, b_d, c_d)
    print('c_d =', c_d.copy_to_host())

    s1 = time.time()
    f_py(a_h, b_h, c_h, N)
    e1 = time.time()
    print('time   py:',e1 - s1)

    s1 = time.time()
    f_bin(a_h, b_h, c_h, N)
    e1 = time.time()
    print('time  jit:',e1 - s1)

    s1 = time.time()
    f_pure_bin(a_h, b_h, c_h, N)
    e1 = time.time()
    print('time njit:',e1 - s1)

    s1 = time.time()
    f_gpu.forall(len(a_d))(a_d, b_d, c_d)
    cuda.synchronize()  # kernel launches are asynchronous; wait for the GPU before stopping the clock
    e1 = time.time()
    print('time gpu1:',e1 - s1)

    s1 = time.time()
    f_gpu[nblocks, nthreads](a_d, b_d, c_d)
    cuda.synchronize()
    e1 = time.time()
    print('time gpu2:',e1 - s1)

The GPU speedup is dramatic: tens of thousands of times faster than pure Python in this run. Note that kernel launches are asynchronous, which is why the timed GPU sections call cuda.synchronize() before reading the clock; without it, the timer would only capture the launch overhead.
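
For finer-grained kernel timing, CUDA events can be used instead of the host clock. A minimal sketch, assuming the f_gpu kernel, device arrays, and nblocks/nthreads launch configuration from Example 2 are already defined:

start = cuda.event(timing=True)
end = cuda.event(timing=True)

start.record()
f_gpu[nblocks, nthreads](a_d, b_d, c_d)
end.record()
end.synchronize()  # wait for the end event (and hence the kernel) to complete

# elapsed time between the two events, in milliseconds
print('kernel time (ms):', cuda.event_elapsed_time(start, end))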

