# Hello World: calling the GPU from Python (Numba CUDA demo)
import cv2
import numpy as np
from numba import cuda
import time
import math
import os
@cuda.jit
def process_gpu(img, channels):
    """CUDA kernel: brighten the image in place.

    Each thread handles one pixel: value = clamp(value * 2.0 + 30, 0, 255)
    for each of its `channels` color components.

    img      -- device array, indexed as img[row, col, channel]
    channels -- number of color channels to process per pixel
    """
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    # FIX: the launcher rounds the grid up with math.ceil, so threads past
    # the image edge exist; without this guard they index out of bounds.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return
    for k in range(channels):
        color = img[tx, ty, k] * 2.0 + 30
        if color > 255:
            img[tx, ty, k] = 255
        elif color < 0:
            img[tx, ty, k] = 0
        else:
            img[tx, ty, k] = color
def process_cpu(img, dst):
    """CPU reference implementation: dst = clamp(img * 2.0 + 30, 0, 255).

    Writes into `dst` in place (same shape/dtype as `img`); `img` is not
    modified. Vectorized with NumPy instead of the original triple Python
    loop over rows/cols/channels — same result, orders of magnitude faster.
    """
    # Compute in float so values above 255 are representable before clipping,
    # mirroring the per-pixel float arithmetic of the loop version.
    scaled = img.astype(np.float64) * 2.0 + 30
    np.clip(scaled, 0, 255, out=scaled)
    # Assignment casts back to dst's dtype (truncation, as in the original
    # element-wise uint8 stores).
    dst[:] = scaled
if __name__ == '__main__':
    img = cv2.imread("test-nvidia.png")
    # cv2.imread returns None (no exception) when the file is missing.
    if img is None:
        raise FileNotFoundError("test-nvidia.png could not be read")
    rows, cols, channels = img.shape
    print(rows, cols, channels)
    dst_cpu = img.copy()
    dst_gpu = img.copy()

    # CPU timing. FIX: the original timed nothing between the two
    # time.time() calls — process_cpu was never invoked.
    start_cpu = time.time()
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    print("CPU process time: " + str(end_cpu - start_cpu))

    # GPU path: copy to device, launch one thread per pixel.
    dImg = cuda.to_device(img)
    threadsperblock = (16, 16)
    # Grid is rounded up so the whole image is covered; the kernel's bounds
    # guard handles the overhang threads.
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))
    print(blockspergrid_x, blockspergrid_y)
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    cuda.synchronize()
    start_gpu = time.time()
    process_gpu[blockspergrid, threadsperblock](dImg, channels)
    # Kernel launches are asynchronous; synchronize before stopping the clock.
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu = dImg.copy_to_host()
    print("GPU Process time: " + str(end_gpu - start_gpu))
    cv2.imwrite("result_gpu.png", dst_gpu)
from numba import cuda,float32
import numpy as np
import math
TPB = 16  # threads per block along each axis (16x16 block), also the shared-memory tile width
@cuda.jit
def gpu_matrix_multiply(matrix1, matrix2, res_matrix):
    """Naive matmul kernel: each thread computes one element of res_matrix.

    res_matrix[row, col] = sum_i matrix1[row, i] * matrix2[i, col]
    """
    row, col = cuda.grid(2)
    # FIX: guard against threads outside the output — the grid is built with
    # math.ceil and may overhang the matrix dimensions.
    if row >= res_matrix.shape[0] or col >= res_matrix.shape[1]:
        return
    # Accumulate in float (the original shadowed the builtin `sum` and
    # started from an int).
    acc = 0.0
    for i in range(matrix1.shape[1]):
        acc += matrix1[row, i] * matrix2[i, col]
    res_matrix[row, col] = acc
@cuda.jit
def gpu_share_matrix_multiply(matrix1, matrix2, res_matrix):
    """Tiled matmul kernel using shared memory.

    Each block cooperatively loads TPB x TPB tiles of matrix1 and matrix2
    into shared memory and accumulates partial dot products.

    Assumes matrix1.shape[1] is a multiple of TPB (the loop count is
    shape[1] / TPB with no remainder handling), as in the demo below.
    """
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    row, col = cuda.grid(2)
    if row >= matrix1.shape[0] or col >= matrix2.shape[1]:
        # NOTE(review): returning before the barriers is only safe when
        # whole blocks fall outside the matrices (true when the dimensions
        # are multiples of TPB, as here) — otherwise in-bounds threads of a
        # partial block would wait at syncthreads() forever.
        return
    thread_x = cuda.threadIdx.x
    thread_y = cuda.threadIdx.y
    tmp = 0.0
    for i in range(int(matrix1.shape[1] / TPB)):
        sA[thread_x, thread_y] = matrix1[row, thread_y + i * TPB]
        # FIX: store the matrix2 tile as sB[thread_x, thread_y], not
        # transposed. The original sB[thread_y, thread_x] store made
        # sB[j, thread_y] read matrix2[thread_y + i*TPB, blockBase + j]
        # instead of matrix2[i*TPB + j, col] — wrong for any non-constant
        # matrix (the all-3s/all-4s demo happened to hide it).
        sB[thread_x, thread_y] = matrix2[thread_x + i * TPB, col]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[thread_x, j] * sB[j, thread_y]
        cuda.syncthreads()
    res_matrix[row, col] = tmp
if __name__ == '__main__':
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the explicit np.float64 (same dtype) so this runs on current NumPy.
    A = np.full((TPB * 50, TPB * 50), 3, np.float64)
    B = np.full((TPB * 50, TPB * 50), 4, np.float64)
    C = np.full((A.shape[0], B.shape[1]), 0, np.float64)
    D = np.full((A.shape[0], B.shape[1]), 0, np.float64)
    print(".......")
    # CPU reference result for visual comparison with the GPU outputs.
    print(A.dot(B))
    threadPerBlock = (TPB, TPB)
    blockPerGrid_x = math.ceil(A.shape[0] / TPB)
    blockPerGrid_y = math.ceil(B.shape[1] / TPB)
    blockPerGrid = (blockPerGrid_x, blockPerGrid_y)
    A_gpu = cuda.to_device(A)
    B_gpu = cuda.to_device(B)
    C_gpu = cuda.to_device(C)
    D_gpu = cuda.to_device(D)
    gpu_matrix_multiply[blockPerGrid, threadPerBlock](A_gpu, B_gpu, C_gpu)
    cuda.synchronize()
    C_gpu_res = C_gpu.copy_to_host()
    print("..........")
    print(C_gpu_res)
    print("..........")
    gpu_share_matrix_multiply[blockPerGrid, threadPerBlock](A_gpu, B_gpu, D_gpu)
    # Ensure the shared-memory kernel has finished before reading back.
    cuda.synchronize()
    D_gpu_res = D_gpu.copy_to_host()
    print(D_gpu_res)