yolov5训练并生成rknn模型部署在RK3588开发板上,实现NPU加速推理
RK3588实战:调用npu加速,yolov5识别图像、ffmpeg发送到rtmp服务器
RK3588 npu python运行 YOLOv8 和 YOLOv8-seg 的教程
国内gitee
RK3588 rknpu2及rknn-toolkit2使用说明
香橙派5使用RK3588S内置NPU加速yolov5推理,实时识别数字达到50fps
多线程异步提高RK3588的NPU占用率,进而提高yolov5s帧率
文章和代码使用yolov5s进行讲解, 其他模型如resnet之类的同理, 稍作修改就可以使用。 由于已经有很多人, 如蓝灵风, 孙启尧等做了如何通过修改模型提高视频推理帧率的教程, 这里我就主要讲另外一种提升性能的方法——多线程异步。
模型修改原理
yolov5s模型激活函数为silu, 此激活函数量化类型为float16, 导致推理过程中使用CPU进行计算, 量化效果较糟。 将激活函数换为relu, 可以在牺牲一点精度的情况下获得巨大性能提升, 目前测试约为80 - 83帧, c++优化后或许有上百?
详情可看蓝灵风大佬的演示视频
多线程异步原理
查看NPU占用率的命令
sudo cat /sys/kernel/debug/rknpu/load
在运行官方demo时我们可以发现,推理过程中NPU使用率较低。
翻阅官方手册后得知,尽管rk官方有提供函数rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0_1_2),使得RKNN模型能使用三核心进行推演,但NPU的使用率仍处于一个较低的水平,如果简单的使用以上函数初始化多个使用多核的rknn模型,在推理过程中就会发生内存泄漏/越界导致系统崩溃
然而使用以下函数初始化多个使用单核的模型却可以完美运行,那么我们接下来的目标就是通过这一方法初始化自己的rknn线程池
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_1)
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_2)
首先设计一个初始化rknn模型的函数
def initRKNN(rknnModel="./rknnModel/yolov5s.rknn", id=0):
    """Load an RKNN model file and start its runtime on the requested core.

    Args:
        rknnModel: Path of the ``.rknn`` model file to load.
        id: Core selector. 0, 1 and 2 select ``RKNNLite.NPU_CORE_0/1/2``,
            -1 selects ``RKNNLite.NPU_CORE_0_1_2``; any other value calls
            ``init_runtime()`` without a core mask.

    Returns:
        The initialised ``RKNNLite`` instance.

    The process exits with the failing return code when loading the model
    or initialising the runtime reports a non-zero status.
    """
    # (selector, RKNNLite attribute name); the attribute is resolved lazily
    # so only the requested mask is ever looked up.
    core_table = (
        (0, "NPU_CORE_0"),
        (1, "NPU_CORE_1"),
        (2, "NPU_CORE_2"),
        (-1, "NPU_CORE_0_1_2"),
    )

    rknn_lite = RKNNLite()
    status = rknn_lite.load_rknn(rknnModel)
    if status != 0:
        print("Load RKNN rknnModel failed")
        exit(status)

    mask_name = next((name for key, name in core_table if id == key), None)
    if mask_name is None:
        status = rknn_lite.init_runtime()
    else:
        status = rknn_lite.init_runtime(core_mask=getattr(RKNNLite, mask_name))
    if status != 0:
        print("Init runtime environment failed")
        exit(status)

    print(rknnModel, "\t\tdone")
    return rknn_lite
初始化多个rknn模型为一个rknn对象列表,以备后面调用
def initRKNNs(rknnModel="./rknnModel/yolov5s.rknn", TPEs=1):
    """Create ``TPEs`` RKNN runtimes for the same model file.

    Runtime ``i`` is initialised with core selector ``i % 3``, so the
    instances cycle through selectors 0, 1 and 2.

    Args:
        rknnModel: Path of the ``.rknn`` model file.
        TPEs: Number of runtimes to create.

    Returns:
        List of the objects returned by ``initRKNN``, in creation order.
    """
    return [initRKNN(rknnModel, index % 3) for index in range(TPEs)]
最后结合python官方库里的线程池写一个自己的rknn池
class rknnPoolExecutor():
    """Thread pool that feeds frames to a fixed set of RKNN runtimes.

    Frames are submitted in order and results are handed back in the same
    order, because the pending futures are kept in a FIFO queue.
    """

    def __init__(self, rknnModel, TPEs, func):
        """Create ``TPEs`` runtimes and a thread pool of the same size.

        Args:
            rknnModel: Path of the ``.rknn`` model file.
            TPEs: Number of runtimes and worker threads.
            func: Callable invoked as ``func(rknn_lite, frame)`` per frame.
        """
        self.func = func
        self.TPEs = TPEs
        self.num = 0  # number of frames submitted so far
        self.queue = Queue()  # pending futures, oldest first
        self.rknnPool = initRKNNs(rknnModel, TPEs)
        self.pool = ThreadPoolExecutor(max_workers=TPEs)

    def put(self, frame):
        """Submit ``frame`` for processing on the next runtime (round-robin)."""
        # NOTE(review): the runtime is picked by submission index, not by
        # which one is idle -- confirm that a runtime can never be handed
        # to two running tasks at once when more than TPEs frames are queued.
        runtime = self.rknnPool[self.num % self.TPEs]
        future = self.pool.submit(self.func, runtime, frame)
        self.queue.put(future)
        self.num += 1

    def get(self):
        """Return ``(result, True)`` for the oldest pending frame.

        Blocks until that frame has been processed. Returns
        ``(None, False)`` when nothing is pending.
        """
        if self.queue.empty():
            return None, False
        oldest = self.queue.get()
        return oldest.result(), True

    def release(self):
        """Shut the thread pool down, then release every RKNN runtime."""
        self.pool.shutdown()
        for runtime in self.rknnPool:
            runtime.release()
由于我们要处理的是视频, 一个有着严格时间顺序的对象, 所以我们这里需要提前向线程池中输入几帧, 以达到异步的操作(这里的myFunc是一个函数对象, 对输入进行处理、推理和绘制目标框体, 最后返回目标图像)
# Number of worker threads (one RKNN runtime is created per thread)
TPEs = 6
# Build the RKNN pool
pool = rknnPoolExecutor(
    rknnModel=modelPath,
    TPEs=TPEs,
    func=myFunc)
# Pre-fill the pool with TPEs + 1 frames so that later get() calls always
# have work already in flight.
if cap.isOpened():
    for _ in range(TPEs + 1):
        ret, frame = cap.read()
        if not ret:
            cap.release()
            # `del pool` would only unbind the name; release() is what
            # actually shuts the threads down and frees the RKNN runtimes.
            pool.release()
            exit(-1)
        pool.put(frame)
然后便可以对视频进行正常推理
# frames: successfully read frames; loopTime: start of the current 30-frame
# window; initTime: start of the whole run.
frames, loopTime, initTime = 0, time.time(), time.time()
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # Count the frame only after the read succeeded, otherwise the final
    # average is inflated by the failed read that ends the loop.
    frames += 1
    pool.put(frame)
    # get() returns the oldest pending result, not the frame just submitted.
    frame, flag = pool.get()
    if not flag:
        break
    cv2.imshow('test', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    if frames % 30 == 0:
        print("30帧平均帧率:\t", 30 / (time.time() - loopTime), "帧")
        loopTime = time.time()
最后释放所有资源,防止NPU内存问题
# Report the average frame rate over the whole run
elapsed = time.time() - initTime
print("总平均帧率\t", frames / elapsed)
# Tear down the capture, the display windows and the RKNN pool
cap.release()
cv2.destroyAllWindows()
pool.release()
这是在不同线程数下视频推理的帧率:
测试模型来源 yolov5s,激活函数为silu(非relu优化版本)
resnet18_for_rk3588, resnet26, resnet50
yolov5s在6线程下NPU利用率仅有50 - 60%左右, 性能劣化原因猜想:
受GIL限制, python的多线程为伪多线程, 换为c++或许在8线程前仍有较大提升
rk3588的CPU性能跟不上, 对OpenCV绘框部分做c++优化或许有提升
完整代码
可移步rknn多线程获取yolov5s, resnet26, resnet50的rknn模型、完整代码和演示视频
main.py
import cv2
import time
from rknnpool import rknnPoolExecutor
# 图像处理函数,实际应用过程中需要自行修改
from func import myFunc
# Video source; pass 0 to cv2.VideoCapture instead to read from a camera.
cap = cv2.VideoCapture('./video/islandBenchmark.mp4')
modelPath = "./rknnModel/yolov5s.rknn"
# Number of worker threads (one RKNN runtime is created per thread)
TPEs = 6
# Build the RKNN pool
pool = rknnPoolExecutor(
    rknnModel=modelPath,
    TPEs=TPEs,
    func=myFunc)

# Pre-fill the pool with TPEs + 1 frames so that later get() calls always
# have work already in flight.
if cap.isOpened():
    for _ in range(TPEs + 1):
        ret, frame = cap.read()
        if not ret:
            cap.release()
            # `del pool` would only unbind the name; release() is what
            # actually shuts the threads down and frees the RKNN runtimes.
            pool.release()
            exit(-1)
        pool.put(frame)

# frames: successfully read frames; loopTime: start of the current 30-frame
# window; initTime: start of the whole run.
frames, loopTime, initTime = 0, time.time(), time.time()
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # Count the frame only after the read succeeded, otherwise the final
    # average is inflated by the failed read that ends the loop.
    frames += 1
    pool.put(frame)
    # get() returns the oldest pending result, not the frame just submitted.
    frame, flag = pool.get()
    if not flag:
        break
    cv2.imshow('test', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    if frames % 30 == 0:
        print("30帧平均帧率:\t", 30 / (time.time() - loopTime), "帧")
        loopTime = time.time()

print("总平均帧率\t", frames / (time.time() - initTime))
# Release the capture, the display windows and the RKNN pool
cap.release()
cv2.destroyAllWindows()
pool.release()
rknnpool.py
from queue import Queue
from rknnlite.api import RKNNLite
from concurrent.futures import ThreadPoolExecutor, as_completed
def initRKNN(rknnModel="./rknnModel/yolov5s.rknn", id=0):
    """Load an RKNN model file and start its runtime on the requested core.

    Args:
        rknnModel: Path of the ``.rknn`` model file to load.
        id: Core selector. 0, 1 and 2 select ``RKNNLite.NPU_CORE_0/1/2``,
            -1 selects ``RKNNLite.NPU_CORE_0_1_2``; any other value calls
            ``init_runtime()`` without a core mask.

    Returns:
        The initialised ``RKNNLite`` instance.

    The process exits with the failing return code when loading the model
    or initialising the runtime reports a non-zero status.
    """
    # (selector, RKNNLite attribute name); the attribute is resolved lazily
    # so only the requested mask is ever looked up.
    core_table = (
        (0, "NPU_CORE_0"),
        (1, "NPU_CORE_1"),
        (2, "NPU_CORE_2"),
        (-1, "NPU_CORE_0_1_2"),
    )

    rknn_lite = RKNNLite()
    status = rknn_lite.load_rknn(rknnModel)
    if status != 0:
        print("Load RKNN rknnModel failed")
        exit(status)

    mask_name = next((name for key, name in core_table if id == key), None)
    if mask_name is None:
        status = rknn_lite.init_runtime()
    else:
        status = rknn_lite.init_runtime(core_mask=getattr(RKNNLite, mask_name))
    if status != 0:
        print("Init runtime environment failed")
        exit(status)

    print(rknnModel, "\t\tdone")
    return rknn_lite
def initRKNNs(rknnModel="./rknnModel/yolov5s.rknn", TPEs=1):
    """Create ``TPEs`` RKNN runtimes for the same model file.

    Runtime ``i`` is initialised with core selector ``i % 3``, so the
    instances cycle through selectors 0, 1 and 2.

    Args:
        rknnModel: Path of the ``.rknn`` model file.
        TPEs: Number of runtimes to create.

    Returns:
        List of the objects returned by ``initRKNN``, in creation order.
    """
    return [initRKNN(rknnModel, index % 3) for index in range(TPEs)]
class rknnPoolExecutor():
    """Thread pool that feeds frames to a fixed set of RKNN runtimes.

    Frames are submitted in order and results are handed back in the same
    order, because the pending futures are kept in a FIFO queue.
    """

    def __init__(self, rknnModel, TPEs, func):
        """Create ``TPEs`` runtimes and a thread pool of the same size.

        Args:
            rknnModel: Path of the ``.rknn`` model file.
            TPEs: Number of runtimes and worker threads.
            func: Callable invoked as ``func(rknn_lite, frame)`` per frame.
        """
        self.func = func
        self.TPEs = TPEs
        self.num = 0  # number of frames submitted so far
        self.queue = Queue()  # pending futures, oldest first
        self.rknnPool = initRKNNs(rknnModel, TPEs)
        self.pool = ThreadPoolExecutor(max_workers=TPEs)

    def put(self, frame):
        """Submit ``frame`` for processing on the next runtime (round-robin)."""
        # NOTE(review): the runtime is picked by submission index, not by
        # which one is idle -- confirm that a runtime can never be handed
        # to two running tasks at once when more than TPEs frames are queued.
        runtime = self.rknnPool[self.num % self.TPEs]
        future = self.pool.submit(self.func, runtime, frame)
        self.queue.put(future)
        self.num += 1

    def get(self):
        """Return ``(result, True)`` for the oldest pending frame.

        Blocks until that frame has been processed. Returns
        ``(None, False)`` when nothing is pending.
        """
        if self.queue.empty():
            return None, False
        oldest = self.queue.get()
        return oldest.result(), True

    def release(self):
        """Shut the thread pool down, then release every RKNN runtime."""
        self.pool.shutdown()
        for runtime in self.rknnPool:
            runtime.release()
func.py
#以下代码改自https://github.com/rockchip-linux/rknn-toolkit2/tree/master/examples/onnx/yolov5
import cv2
import numpy as np
from rknnlite.api import RKNNLite
QUANTIZE_ON = True

# Objectness / class-score threshold used when filtering candidate boxes
OBJ_THRESH = 0.25
# IoU threshold above which a lower-scored box is suppressed
NMS_THRESH = 0.45
# Side length (pixels) of the square model input
IMG_SIZE = 640

# Class labels indexed by class id. Several labels carry a trailing space;
# they are kept exactly as-is because they are rendered verbatim.
CLASSES = (
    "person", "bicycle", "car", "motorbike ", "aeroplane ",
    "bus ", "train", "truck ", "boat", "traffic light",
    "fire hydrant", "stop sign ", "parking meter", "bench", "bird",
    "cat", "dog ", "horse ", "sheep", "cow",
    "elephant", "bear", "zebra ", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "wine glass", "cup", "fork", "knife ", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza ", "donut",
    "cake", "chair", "sofa", "pottedplant", "bed",
    "diningtable", "toilet ", "tvmonitor", "laptop ", "mouse ",
    "remote ", "keyboard ", "cell phone", "microwave ", "oven ",
    "toaster", "sink", "refrigerator ", "book", "clock",
    "vase", "scissors ", "teddy bear ", "hair drier", "toothbrush ",
)
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``, element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def xywh2xyxy(x):
    """Convert rows of ``[cx, cy, w, h]`` into ``[x1, y1, x2, y2]``.

    Works on a copy of ``x``; only the first four columns are rewritten and
    the input array is left untouched.
    """
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    corners = np.copy(x)
    corners[:, 0] = x[:, 0] - half_w  # left edge
    corners[:, 1] = x[:, 1] - half_h  # top edge
    corners[:, 2] = x[:, 0] + half_w  # right edge
    corners[:, 3] = x[:, 1] + half_h  # bottom edge
    return corners
def process(input, mask, anchors):
    """Decode one YOLOv5 output head into boxes, objectness and class scores.

    Args:
        input: Raw head output indexed as ``[grid_y, grid_x, anchor, channel]``
            with channels ``x, y, w, h, objectness, class scores...``.
        mask: Indices into ``anchors`` selecting the anchors of this head.
        anchors: Sequence of ``[w, h]`` anchor sizes.

    Returns:
        ``(box, box_confidence, box_class_probs)`` where ``box`` holds
        ``[cx, cy, w, h]`` in the last axis, ``box_confidence`` has a
        trailing axis of size 1 and ``box_class_probs`` holds one score per
        class.
    """
    head_anchors = [anchors[i] for i in mask]
    grid_h, grid_w = map(int, input.shape[0:2])

    box_confidence = np.expand_dims(sigmoid(input[..., 4]), axis=-1)
    box_class_probs = sigmoid(input[..., 5:])

    box_xy = sigmoid(input[..., :2]) * 2 - 0.5

    # Per-cell (column, row) offsets. meshgrid yields (grid_h, grid_w)
    # arrays for any grid shape; the previous tile-based construction only
    # produced the right shape when grid_h == grid_w.
    col, row = np.meshgrid(np.arange(grid_w), np.arange(grid_h))
    grid = np.stack((col, row), axis=-1).reshape(grid_h, grid_w, 1, 2)

    # The size-1 anchor axis broadcasts over however many anchors the head has.
    box_xy += grid
    # Stride is derived from the grid height only, i.e. a square IMG_SIZE
    # input is assumed here.
    box_xy *= int(IMG_SIZE / grid_h)

    box_wh = pow(sigmoid(input[..., 2:4]) * 2, 2)
    box_wh = box_wh * head_anchors

    box = np.concatenate((box_xy, box_wh), axis=-1)
    return box, box_confidence, box_class_probs
def filter_boxes(boxes, box_confidences, box_class_probs):
    """Keep only candidates whose objectness and best class score pass OBJ_THRESH.

    This differs slightly from the original YOLOv5 post-processing.

    Args:
        boxes: ndarray of boxes, 4 values in the last axis.
        box_confidences: ndarray of objectness values, one per box.
        box_class_probs: ndarray of class scores, classes in the last axis.

    Returns:
        ``(boxes, classes, scores)``: the surviving boxes, the index of
        their best class, and ``best class score * objectness``.
    """
    flat_boxes = boxes.reshape(-1, 4)
    flat_conf = box_confidences.reshape(-1)
    flat_probs = box_class_probs.reshape(-1, box_class_probs.shape[-1])

    # Stage 1: drop candidates with low objectness.
    confident = flat_conf >= OBJ_THRESH
    flat_boxes = flat_boxes[confident]
    flat_conf = flat_conf[confident]
    flat_probs = flat_probs[confident]

    # Stage 2: drop candidates whose best class score is low.
    best_score = np.max(flat_probs, axis=-1)
    best_class = np.argmax(flat_probs, axis=-1)
    classified = best_score >= OBJ_THRESH

    return (
        flat_boxes[classified],
        best_class[classified],
        (best_score * flat_conf)[classified],
    )
def nms_boxes(boxes, scores):
    """Suppress non-maximal boxes.

    Args:
        boxes: ndarray of shape (N, 4) holding ``[x1, y1, x2, y2]`` rows.
        scores: ndarray of shape (N,) with one score per box.

    Returns:
        Integer ndarray with the indices of the boxes to keep, highest score
        first. Empty input yields an empty integer array, so the result can
        always be used directly as an index.
    """
    x = boxes[:, 0]
    y = boxes[:, 1]
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]

    areas = w * h
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]

        # Intersection rectangle between the best box and every remaining one.
        xx1 = np.maximum(x[i], x[rest])
        yy1 = np.maximum(y[i], y[rest])
        xx2 = np.minimum(x[i] + w[i], x[rest] + w[rest])
        yy2 = np.minimum(y[i] + h[i], y[rest] + h[rest])

        w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
        h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
        inter = w1 * h1

        ovr = inter / (areas[i] + areas[rest] - inter)
        inds = np.where(ovr <= NMS_THRESH)[0]
        # +1 because `inds` is relative to `rest`, which skips order[0].
        order = order[inds + 1]

    # An explicit integer dtype keeps the empty case indexable;
    # np.array([]) would default to float64.
    return np.array(keep, dtype=np.intp)
def yolov5_post_process(input_data):
    """Turn the three decoded-ready head outputs into final detections.

    Each head is decoded with ``process``, filtered with ``filter_boxes``,
    converted to corner format and then put through per-class NMS.

    Args:
        input_data: Sequence of head outputs, paired in order with the three
            anchor masks.

    Returns:
        ``(boxes, classes, scores)`` ndarrays, or ``(None, None, None)``
        when nothing survives filtering.
    """
    masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
               [59, 119], [116, 90], [156, 198], [373, 326]]

    # Decode and threshold every head independently.
    per_head = [
        filter_boxes(*process(head, mask, anchors))
        for head, mask in zip(input_data, masks)
    ]

    boxes = xywh2xyxy(np.concatenate([result[0] for result in per_head]))
    classes = np.concatenate([result[1] for result in per_head])
    scores = np.concatenate([result[2] for result in per_head])

    # Non-maximum suppression is run separately for each detected class.
    kept_boxes, kept_classes, kept_scores = [], [], []
    for class_id in set(classes):
        selected = np.where(classes == class_id)
        class_boxes = boxes[selected]
        class_ids = classes[selected]
        class_scores = scores[selected]
        keep = nms_boxes(class_boxes, class_scores)
        kept_boxes.append(class_boxes[keep])
        kept_classes.append(class_ids[keep])
        kept_scores.append(class_scores[keep])

    if not kept_classes and not kept_scores:
        return None, None, None

    return (
        np.concatenate(kept_boxes),
        np.concatenate(kept_classes),
        np.concatenate(kept_scores),
    )
def draw(image, boxes, scores, classes):
    """Draw every detection onto ``image`` in place.

    Args:
        image: Image to annotate.
        boxes: Iterable of 4-value boxes; the first pair is used as one
            rectangle corner (and label anchor), the second pair as the
            opposite corner.
        scores: Score per box, rendered with two decimals.
        classes: Class index per box, looked up in ``CLASSES``.
    """
    for box, score, cl in zip(boxes, scores, classes):
        x1, y1, x2, y2 = (int(value) for value in box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
        label = '{0} {1:.2f}'.format(CLASSES[cl], score)
        # Label sits 6 px above the first corner of the box.
        cv2.putText(image, label, (x1, y1 - 6),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
def letterbox(im, new_shape=(640, 640), color=(0, 0, 0)):
    """Resize ``im`` to fit ``new_shape`` keeping aspect ratio, then pad.

    Args:
        im: Source image; ``im.shape[:2]`` is read as (height, width).
        new_shape: Target (height, width), or a single int for a square.
        color: Border colour used for the padding.

    Returns:
        ``(image, ratio, (dw, dh))``: the padded image, the (identical)
        width/height scale factors, and the per-side padding in pixels as
        floats.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # One scale for both axes so the aspect ratio is preserved.
    scale = min(dst_h / src_h, dst_w / src_w)
    ratio = (scale, scale)

    resized_wh = (int(round(src_w * scale)), int(round(src_h * scale)))
    # Total padding split evenly between the two sides of each axis.
    dw = (dst_w - resized_wh[0]) / 2
    dh = (dst_h - resized_wh[1]) / 2

    if (src_w, src_h) != resized_wh:
        im = cv2.resize(im, resized_wh, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudges make an odd total padding round to n and n+1.
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right,
                            cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (dw, dh)
def myFunc(rknn_lite, IMG):
    """Run YOLOv5 on one BGR frame and return the annotated frame.

    The frame is converted to RGB, resized to IMG_SIZE x IMG_SIZE (no
    letterboxing), passed through ``rknn_lite.inference`` and
    post-processed; detections are drawn on the resized image.

    Args:
        rknn_lite: Object providing ``inference(inputs=[...])``.
        IMG: Input frame in BGR channel order.

    Returns:
        The resized BGR image with boxes drawn when any were found.
    """
    rgb = cv2.cvtColor(IMG, cv2.COLOR_BGR2RGB)
    rgb = cv2.resize(rgb, (IMG_SIZE, IMG_SIZE))

    outputs = rknn_lite.inference(inputs=[rgb])

    heads = []
    for index in range(3):
        raw = outputs[index]
        # Split the leading axes into 3 groups (one per anchor), keeping the
        # last two axes, then move those two axes to the front.
        grouped = raw.reshape([3, -1] + list(raw.shape[-2:]))
        heads.append(np.transpose(grouped, (2, 3, 0, 1)))

    boxes, classes, scores = yolov5_post_process(heads)

    annotated = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    if boxes is not None:
        draw(annotated, boxes, scores, classes)
    return annotated