Here the YOLO output comes at three grid scales: 13*13, 26*26 and 52*52. Each grid cell predicts 3 anchor boxes to handle overlapping objects, and there are 80 object classes.
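A quick sanity check of those numbers (a minimal standalone sketch, not part of the script below): with 3 anchors per cell, the three scales together produce 10647 raw candidate boxes before any filtering.
grid_sizes = [13, 26, 52]
anchors_per_cell = 3
total = sum(g * g * anchors_per_cell for g in grid_sizes)
print(total)  # 10647 candidates, each carrying 4 coords + 1 objectness + 80 class probs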
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import cv2
from IPython.display import Image, display
from yolo_utils import read_classes, read_anchors, yolo_head, preprocess_image, generate_colors, draw_outputs
def yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=.6):
"""
过滤掉那些概率低的边框
:param box_confidence: 装载着每个边框的pc [13, 13, 3, 1]
:param boxes:装载着每个边框的坐标 [13, 13, 3, 4]
:param box_class_probs:装载着每个边框的80个种类的概率 [13, 13, 3, 80]
:param threshold:阈值,低于这个值的边框会被过滤掉
:return:
scores: 装载着保留下的边框的概率
boxes: 装载着保留下的边框的坐标
classes: 装载着保留下的那些边框的种类索引
"""
# 将pc与c相乘,得到具体某个种类是否存在的概率(置信度),假设某个边界框的 Objectness Score(即PC) 为 0.8,
# 表示模型认为该框中有 80% 的可能性存在某个物体;而对应的某个类别的 Class Probability 为 0.7,
# 那么总的置信度为:Total Confidence=0.8×0.7=0.56 即这个边界框的总置信度分数为 0.56。
box_scores = box_confidence * box_class_probs # [13, 13, 3, 80]
# 获取概率最大的那个种类的索引 (-1代表沿着最后一个维度,当然最大概率相同的话,则有多个)
box_classes = tf.argmax(box_scores, axis=-1) # [13, 13, 3]
# 获取概率最大的那个种类的概率值
box_class_scores = tf.reduce_max(box_scores, axis=-1) # [13, 13, 3]
# 创建一个过滤器,当某个种类的概率值大于等于阈值threshold时,对应于这个种类的filtering_mask中的位置就是true,否则就是false。故filtering_mask就是
# [False, true, False....]这种形式
filtering_mask = tf.greater_equal(box_class_scores, threshold) # [13, 13, 3]
# 用上面的过滤器来过滤掉那些小概率的边框,过滤完成后scores、boxes、classes就只装载了概率大的边框的的概率值和坐标以及种类索引了
scores = tf.boolean_mask(box_class_scores, filtering_mask) # 格子中,3个anchor,取概率大于0.6的,展开
boxes = tf.boolean_mask(boxes, filtering_mask) # 格子中,3个anchor,取概率大于0.6的,展开
classes = tf.boolean_mask(box_classes, filtering_mask) # 格子中,3个anchor,取概率大于0.6的,展开
return scores, boxes, classes
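# A minimal hand-check of yolo_filter_boxes (an illustrative sketch with
# made-up numbers and only 3 toy classes): objectness 0.8 times a best class
# probability of 0.7 gives a score of 0.8 * 0.7 = 0.56, which survives a
# threshold of 0.5 but would be dropped at the default 0.6.
def filter_demo():
    box_confidence = tf.constant([[[[0.8]]]])             # [1, 1, 1, 1]
    boxes = tf.constant([[[[0.1, 0.1, 0.4, 0.4]]]])       # [1, 1, 1, 4]
    box_class_probs = tf.constant([[[[0.7, 0.2, 0.1]]]])  # [1, 1, 1, 3]
    scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=0.5)
    print(scores.numpy())   # [0.56]
    print(classes.numpy())  # [0] -- class 0 had the highest probability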
def yolo_non_max_suppression(scores, boxes, classes, max_boxes=20, iou_threshold=0.5):
"""
非最大值值抑制技术过滤掉重叠的边框
:param scores: 已经过滤过的 各个框大于概率大于0.6的框展开的概率
:param boxes:已经过滤过的 各个框大于概率大于0.6的框展开的坐标
:param classes:已经过滤过的 各个框大于概率大于0.6的框展开的索引
:param max_boxes: 最多想要保留多少个框
:param iou_threshold: 交并比,阈值,大于这个阈值的边框才会被非最大值抑制处理
:return:
scores-NMS后保留的那些边框的概率值
boxes-NMS保留下的那些边框的坐标
classes --NMS保留下的那些边框的种类索引
"""
# IOU两处使用场景
# - 训练阶段:IOU 主要用于正负样本匹配和辅助损失函数计算,用来比较预测值和真实标签。
# - 推理阶段:NMS 通过计算多个预测框之间的 IOU 来去除重叠的框。
# 会返回NMS后保留下来的边框索引
nms_indices = tf.image.non_max_suppression(boxes, scores, max_boxes, iou_threshold=iou_threshold)
# 通过上面的索引来分别获取被保留下来的边框的相关概率值、坐标以及种类索引
scores = tf.gather(scores, nms_indices)
boxes = tf.gather(boxes, nms_indices)
classes = tf.gather(classes, nms_indices)
return scores, boxes, classes
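# A hand-worked NMS check (illustrative sketch): box 0 and box 2 overlap in a
# 1x1 square, so IoU = 1 / (4 + 4 - 1) = 1/7 ≈ 0.14 and both are kept; box 1
# overlaps box 0 almost completely (IoU ≈ 0.82 > 0.5), so it is suppressed.
def nms_demo():
    boxes = tf.constant([[0.0, 0.0, 2.0, 2.0],
                         [0.1, 0.1, 2.1, 2.1],
                         [1.0, 1.0, 3.0, 3.0]])
    scores = tf.constant([0.9, 0.8, 0.7])
    classes = tf.constant([0, 1, 2])
    s, b, c = yolo_non_max_suppression(scores, boxes, classes)
    print(s.numpy())  # [0.9 0.7] -- the 0.8 box was suppressed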
def yolo_eval(outputs, max_boxes=50, score_threshold=.5, iou_threshold=.6):
"""
过滤多余边框
:param outputs: YOLO模型结果
:param max_boxes: 最多识别出的边框
:param score_threshold: 概率值阈值
:param iou_threshold: 交并比阈值(用于推理结果的NMS)
:return:
scores -- 最终保留下那些边框的概率值
boxes--最终保留下的那些边框的坐标
classes-- 最终保留下的那些边框的种类的索引
"""
s, b, c = [], [], []
# 后续调用yolov3时,使用了3个规格的网格(13*13, 26*26, 52*52)进行预测,所以有3组output
for output in outputs:
# YOLO的输出结果分成3分:概率值、坐标、种类索引
box_confidence, boxes, box_class_probs = output
scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=score_threshold)
s.append(scores)
b.append(boxes)
c.append(classes)
    # Concatenate the results of the 3 scales
scores = tf.concat(s, axis=0)
boxes = tf.concat(b, axis=0)
classes = tf.concat(c, axis=0)
# NMS
scores, boxes, classes = yolo_non_max_suppression(scores, boxes, classes, max_boxes=max_boxes,
iou_threshold=iou_threshold)
return scores, boxes, classes
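# Note: yolo_eval runs one NMS pass over all classes together (class-agnostic),
# so a strong box of one class can suppress an overlapping box of a different
# class. A per-class variant would look like this (an illustrative sketch, not
# part of the original pipeline):
def per_class_nms(scores, boxes, classes, max_boxes=50, iou_threshold=0.5):
    kept_s, kept_b, kept_c = [], [], []
    # run NMS separately for each class that actually occurs
    for cls in tf.unique(classes)[0]:
        mask = tf.equal(classes, cls)
        cls_scores = tf.boolean_mask(scores, mask)
        cls_boxes = tf.boolean_mask(boxes, mask)
        idx = tf.image.non_max_suppression(cls_boxes, cls_scores, max_boxes,
                                           iou_threshold=iou_threshold)
        kept_s.append(tf.gather(cls_scores, idx))
        kept_b.append(tf.gather(cls_boxes, idx))
        kept_c.append(tf.fill([tf.shape(idx)[0]], cls))
    return tf.concat(kept_s, 0), tf.concat(kept_b, 0), tf.concat(kept_c, 0)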
def test3():
yolo_output = (tf.random.normal([13, 13, 3, 1], mean=1, stddev=4, seed=1),
tf.random.normal([13, 13, 3, 4], mean=1, stddev=4, seed=1),
tf.random.normal([13, 13, 3, 80], mean=1, stddev=4, seed=1))
yolo_output1 = (tf.random.normal([26, 26, 3, 1], mean=1, stddev=4, seed=2),
tf.random.normal([26, 26, 3, 4], mean=1, stddev=4, seed=2),
tf.random.normal([26, 26, 3, 80], mean=1, stddev=4, seed=2))
yolo_output2 = (tf.random.normal([52, 52, 3, 1], mean=1, stddev=4, seed=3),
tf.random.normal([52, 52, 3, 4], mean=1, stddev=4, seed=3),
tf.random.normal([52, 52, 3, 80], mean=1, stddev=4, seed=3))
yolo_outputs = (yolo_output, yolo_output1, yolo_output2)
scores, boxes, classes = yolo_eval(yolo_outputs)
print("scores[2] = ", scores[2])
print("boxes[2] = ", boxes[2])
print("classes[2] = ", classes[2])
print("scores.shape = ", scores.shape)
print("boxes.shape = ", boxes.shape)
print("classes.shape = ", classes.shape)
def test1():
box_confidence = tf.random.normal([13, 13, 3, 1], mean=1, stddev=4, seed=1)
boxes = tf.random.normal([13, 13, 3, 4], mean=1, stddev=4, seed=1)
box_class_probs = tf.random.normal([13, 13, 3, 80], mean=1, stddev=4, seed=1)
scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=0.5)
print("scores[2] = ", scores[2])
print("boxes[2] = ", boxes[2])
print("classes[2] = ", classes[2])
print("scores.shape = ", scores.shape)
print("boxes.shape = ", boxes.shape)
print("classes.shape = ", classes.shape)
def test2():
scores = tf.random.normal([54, ], mean=1, stddev=4, seed=1)
boxes = tf.random.normal([54, 4], mean=1, stddev=4, seed=1)
classes = tf.random.normal([54, ], mean=1, stddev=4, seed=1)
scores, boxes, classes = yolo_non_max_suppression(scores, boxes, classes)
print("scores[2] = ", scores[2])
print("boxes[2] = ", boxes[2])
print("classes[2] = ", classes[2])
print("scores.shape = ", scores.shape)
print("boxes.shape = ", boxes.shape)
print("classes.shape = ", classes.shape)
def img_show(img_file_path, out_scores, out_boxes, out_classes, class_names):
img_raw = tf.image.decode_image(open(img_file_path, 'rb').read(), channels=3)
    img = cv2.cvtColor(img_raw.numpy(), cv2.COLOR_RGB2BGR)
colors = generate_colors(class_names)
print("Found {} boxes for {}".format(len(out_boxes), img_file_path))
img = draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names)
# display(Image(data=bytes(cv2.imencode(".jpg", img)[1]), width=800))
# file_name = [x for x in img_file_path.split(".")]
# cv2.imwrite('./out/' + file_name[0] + '_out.' + file_name[1], img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
def predict(model, img_file_path, anchors, class_names):
img_raw = tf.image.decode_image(open(img_file_path, 'rb').read(), channels=3)
img = tf.expand_dims(img_raw, 0)
img = tf.image.resize(img, (416, 416)) / 255.
yolo_outputs = model(img)
outputs = yolo_head(yolo_outputs, anchors, len(class_names))
out_scores, out_boxes, out_classes = yolo_eval(outputs)
img = img_show(img_file_path, out_scores, out_boxes, out_classes, class_names)
plt.imshow(img)
plt.show()
def predict_frame(model, frame, anchors, class_names):
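    # NOTE (assumption): frames from cv2.VideoCapture arrive in BGR order while
    # the model was presumably trained on RGB images; converting first with
    # cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) may improve detection accuracy.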
img = tf.expand_dims(frame, 0)
img = tf.image.resize(img, (416, 416)) / 255.
yolo_outputs = model(img)
outputs = yolo_head(yolo_outputs, anchors, len(class_names))
out_scores, out_boxes, out_classes = yolo_eval(outputs)
img = img_show_frame(frame, out_scores, out_boxes, out_classes, class_names)
return img
def img_show_frame(frame, out_scores, out_boxes, out_classes, class_names):
    img_raw = tf.constant(frame)
    img = cv2.cvtColor(img_raw.numpy(), cv2.COLOR_RGB2BGR)
colors = generate_colors(class_names)
img = draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names)
# display(Image(data=bytes(cv2.imencode(".jpg", img)[1]), width=800))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
def predict_video(model, video_file, anchors, class_names):
    # Open the video file
    cap = cv2.VideoCapture(video_file)
    # Video dimensions and frame rate
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Output file configuration
    fourcc = cv2.VideoWriter.fourcc(*'mp4v')  # mp4v codec
    output_video = cv2.VideoWriter('output_video.mp4', fourcc, fps, (width, height))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("total frames: {}".format(total_frames))
    i = 0
    # Process the video frame by frame
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
        i = i + 1
        # Run the model on this frame
        result = predict_frame(model, frame, anchors, class_names)
        print(f"processed frame {i}/{total_frames}")
        # Show the processed frame; waitKey is required for the window to
        # refresh, and pressing 'q' stops early
        cv2.imshow('Frame', result)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        # Write the frame to the output video
        output_video.write(result)
    # Release resources
cap.release()
output_video.release()
cv2.destroyAllWindows()
if __name__ == "__main__":
# test1()
# test2()
# test3()
class_names = read_classes("model_data/coco_classes.txt")
anchors = read_anchors("model_data/yolo_anchors.txt")
    # Load the model
yolo_model = tf.keras.models.load_model('model_data/yolo_model.h5')
# yolo_model.summary()
# img = predict(yolo_model, '/tmp/images/0070.jpg', anchors,class_names)
predict_video(yolo_model, '/tmp/424_1729159656.mp4', anchors, class_names)
yolo_utils.py:
import tensorflow as tf
import numpy as np
import cv2
import colorsys
import random
from IPython.display import Image, display
# As TensorFlow Lite doesn't support tf.size, which is used inside tf.meshgrid,
# we reimplement a simple meshgrid function using only basic tf operations.
def _meshgrid(n_a, n_b):
return [
tf.reshape(tf.tile(tf.range(n_a), [n_b]), (n_b, n_a)),
tf.reshape(tf.repeat(tf.range(n_b), n_a), (n_b, n_a))
]
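# A quick check (illustrative) that _meshgrid matches tf.meshgrid for the sizes
# used here: both return the x- and y-index grids with shape (n_b, n_a).
def meshgrid_check():
    a, b = _meshgrid(3, 2)
    xs, ys = tf.meshgrid(tf.range(3), tf.range(2))
    print(tf.reduce_all(a == xs).numpy(), tf.reduce_all(b == ys).numpy())  # True True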
def yolo_head(preds, anchors, classes):  # preds holds 3 predictions, one per grid scale (13*13, 26*26, 52*52)
# pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))
outputs = {}
for i in range(3):
pred = preds[i]
grid_size = tf.shape(pred)[1:3]
box_xy, box_wh, objectness, class_probs = tf.split(
pred, (2, 2, 1, classes), axis=-1)
box_xy = tf.sigmoid(box_xy)
objectness = tf.sigmoid(objectness)
class_probs = tf.sigmoid(class_probs)
# pred_box = tf.concat((box_xy, box_wh), axis=-1) # original xywh for loss
# !!! grid[x][y] == (y, x)
grid = _meshgrid(grid_size[1], grid_size[0])
grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) # [gx, gy, 1, 2]
box_xy = (box_xy + tf.cast(grid, tf.float32)) / \
tf.cast(grid_size, tf.float32)
        # The (x, y) coordinates are offsets relative to the grid cell and give
        # the object's center; adding the cell index and dividing by the grid
        # size turns them into fractions of the whole image.
        # The (w, h) values are log-space offsets relative to the anchor box:
        # the network's output is exponentiated to recover a scale factor,
        # which is multiplied by the anchor's width and height to get the
        # actual size.
        # The index arithmetic 6 - i*3, 7 - i*3, 8 - i*3 below selects:
        #   anchors[6..8] when i=0: the 13*13 grid (deepest layer), large objects
        #   anchors[3..5] when i=1: the 26*26 grid, medium objects
        #   anchors[0..2] when i=2: the 52*52 grid, small objects
        # (a hand-worked decode example follows this function)
box_wh = tf.exp(box_wh) * anchors[[6 - i * 3, 7 - i * 3, 8 - i * 3]]
box_x1y1 = box_xy - box_wh / 2
box_x2y2 = box_xy + box_wh / 2
bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)
outputs['output' + str(i)] = (objectness, bbox, class_probs)
return (outputs['output0'], outputs['output1'], outputs['output2'])
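# A hand-worked decode for a single cell (illustrative sketch; the anchor size
# is made up): raw outputs t_x = t_y = t_w = t_h = 0 at cell (6, 6) of the
# 13x13 grid put the box center at ((sigmoid(0) + 6) / 13, ...) = (0.5, 0.5),
# the middle of the image, with width/height exactly equal to the anchor's.
def decode_demo():
    import math
    grid_size = 13
    cx, cy = 6, 6                    # cell indices
    tx = ty = tw = th = 0.0          # made-up raw network outputs
    anchor_w, anchor_h = 0.28, 0.22  # hypothetical anchor, relative units
    bx = (1 / (1 + math.exp(-tx)) + cx) / grid_size
    by = (1 / (1 + math.exp(-ty)) + cy) / grid_size
    bw = math.exp(tw) * anchor_w
    bh = math.exp(th) * anchor_h
    print(bx, by, bw, bh)  # 0.5 0.5 0.28 0.22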
def read_classes(classes_path):
with open(classes_path) as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names
def read_anchors(anchors_path):
with open(anchors_path) as f:
anchors = f.readline()
anchors = [float(x) for x in anchors.split()]
anchors = np.array(anchors).reshape(-1, 2)
return anchors
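# Example anchors file (hypothetical contents; a single whitespace-separated
# line, with units that must match what yolo_head expects):
#   10 13 16 30 33 23 30 61 62 45 59 119 116 90 156 198 373 326
# reshape(-1, 2) pairs these into a (9, 2) array of (width, height) rows:
# anchors[0:3] are the smallest (52x52 scale), anchors[6:9] the largest
# (13x13 scale), matching the index arithmetic in yolo_head.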
def generate_colors(class_names):
hsv_tuples = [(x / len(class_names), 1., 1.) for x in range(len(class_names))]
colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
random.seed(10201) # Fixed seed for consistent colors across runs.
random.shuffle(colors) # Shuffle colors to decorrelate adjacent classes.
random.seed(None) # Reset seed to default.
return colors
def preprocess_image(img_path, model_image_size):
    # Mirrors the preprocessing in predict(): decode, add a batch dimension,
    # resize to the model input size, scale to [0, 1]
    img_raw = tf.image.decode_image(open(img_path, 'rb').read(), channels=3)
    img = tf.image.resize(tf.expand_dims(img_raw, 0), model_image_size) / 255.
    return img_raw, img
def draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names):
wh = np.flip(img.shape[0:2])
for i, c in list(enumerate(out_classes)):
x1y1 = tuple((np.array(out_boxes[i][0:2]) * wh).astype(np.int32))
x2y2 = tuple((np.array(out_boxes[i][2:4]) * wh).astype(np.int32))
        x1y1_label = tuple((np.array(out_boxes[i][0:2]) * wh + [0, -15]).astype(np.int32))
        x2y2_label = tuple(
            (np.array(out_boxes[i][0:2]) * wh + [(len(class_names[int(out_classes[i])]) + 6) * 12, 0]).astype(np.int32))
        img = cv2.rectangle(img, x1y1, x2y2, colors[c], 2)
        img = cv2.rectangle(img, x1y1_label, x2y2_label, colors[c], -1)
img = cv2.putText(img, '{} {:.2f}'.format(
class_names[int(out_classes[i])], out_scores[i]),
x1y1, cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 0), 1)
print('{} {:.2f}'.format(class_names[int(out_classes[i])], out_scores[i]),
x1y1, x2y2)
return img