一个 Grid 按维度分成多个 Block,个数为 gridDim.x * gridDim.y
遍历: blockIdx.x , blockIdx.y
一个 Block 按维度分成多个 Thread,个数为 blockDim.x * blockDim.y
Thread 是最小的运行单元
遍历:threadIdx.x , threadIdx.y
图像处理中,一个像素对应到一个thread 中。
从 thread 映射到图像像素(pixel)的方式:
ix = blockIdx.x*blockDim.x + threadIdx.x;
iy = blockIdx.y*blockDim.y + threadIdx.y;
# -*- coding: utf-8 -*-
import cv2
import numpy as np
from numba import cuda
import time
import math
#GPU function
@cuda.jit
def process_gpu(img, channels):
    """CUDA kernel: brighten one pixel per thread, in place.

    Each thread derives its global 2-D index from its block and thread
    indices and applies ``value * 2.0 + 30`` to the first ``channels``
    channels of ``img[tx, ty]``, clamping the result to the range [0, 255].

    Args:
        img: Device array indexed as ``img[x, y, channel]``; modified in place.
        channels: Number of channels to process for each pixel.
    """
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    # A launch grid rounded up to whole blocks contains threads that fall
    # past the image edge whenever a dimension is not a multiple of the
    # block size.  Those threads must not touch memory.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return

    for c in range(channels):
        color = img[tx, ty, c] * 2.0 + 30
        if color > 255:
            img[tx, ty, c] = 255
        elif color < 0:
            img[tx, ty, c] = 0
        else:
            img[tx, ty, c] = color
#cpu function
def process_cpu(img, channels):
    """Brighten an image in place on the CPU, one element at a time.

    Applies ``value * 2.0 + 30`` to the first ``channels`` channels of every
    pixel and clamps the result to the range [0, 255].  The explicit Python
    loops are kept on purpose: this function is the CPU baseline that the
    GPU kernel is timed against.

    Args:
        img: Array indexed as ``img[row, col, channel]``; modified in place.
        channels: Number of channels to process for each pixel.  Values
            larger than the image's channel count are capped to it.
    """
    rows, cols = img.shape[0], img.shape[1]
    # Honour the caller's channel count, but never index past the channels
    # the image actually has.
    channels = min(channels, img.shape[2])
    for i in range(rows):
        for j in range(cols):
            for c in range(channels):
                color = img[i, j, c] * 2.0 + 30
                if color > 255:
                    img[i, j, c] = 255
                elif color < 0:
                    img[i, j, c] = 0
                else:
                    img[i, j, c] = color
if __name__ == "__main__":
    # Load the input image; cv2.imread returns None instead of raising when
    # the file is missing or cannot be decoded, so check it explicitly.
    img = cv2.imread("DSC00070.jpg")
    if img is None:
        raise SystemExit("Cannot read input image: DSC00070.jpg")
    rows, cols, channels = img.shape

    # CPU baseline: process a private copy so the original stays untouched.
    dst_cpu = img.copy()
    start_cpu = time.time()
    process_cpu(dst_cpu, channels)
    end_cpu = time.time()
    time_cpu = end_cpu - start_cpu
    print("CPU process time: "+str(time_cpu))

    # GPU version: upload the original image and launch one thread per pixel.
    dImg = cuda.to_device(img)
    threadsperblock = (16, 16)
    # Integer ceiling division: enough blocks to cover every row and column.
    blockspergrid_x = (rows + threadsperblock[0] - 1) // threadsperblock[0]
    blockspergrid_y = (cols + threadsperblock[1] - 1) // threadsperblock[1]
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Synchronize before and after the launch so the interval covers the
    # whole kernel execution.  NOTE: this is the first launch of the kernel,
    # so the measured time also includes its JIT compilation.
    cuda.synchronize()
    start_gpu = time.time()
    process_gpu[blockspergrid, threadsperblock](dImg, channels)
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu = dImg.copy_to_host()
    time_gpu = end_gpu - start_gpu
    print("GPU process time: "+str(time_gpu))

    # Save both results for visual comparison.
    cv2.imwrite("result_cpu.jpg", dst_cpu)
    cv2.imwrite("result_gpu.jpg", dst_gpu)
    print("Done.")
结果:
CPU process time: 523.7896401882172
GPU process time: 0.17055010795593262
Done.
原图:
处理后结果: