YOLOv3 Object Detection

Here YOLO outputs predictions on grids at three scales: 13*13, 26*26, and 52*52. Each grid cell has 3 anchor boxes to handle overlapping objects, and there are 80 object classes.
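
For reference, at each scale every grid cell predicts 3 anchors * (4 box coordinates + 1 objectness score + 80 class probabilities) = 255 values. A minimal sketch (independent of the model code below) that spells out those shapes:

num_anchors, num_classes = 3, 80
channels = num_anchors * (4 + 1 + num_classes)  # 3 * 85 = 255
for grid in (13, 26, 52):
    # raw per-scale output of the network, batch dimension omitted
    print(f"scale {grid}x{grid}: ({grid}, {grid}, {channels})")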

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import cv2
from IPython.display import Image, display
from yolo_utils import read_classes, read_anchors, yolo_head, preprocess_image, generate_colors, draw_outputs


def yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=.6):
    """
    过滤掉那些概率低的边框
    :param box_confidence: 装载着每个边框的pc [13, 13, 3, 1]
    :param boxes:装载着每个边框的坐标 [13, 13, 3, 4]
    :param box_class_probs:装载着每个边框的80个种类的概率 [13, 13, 3, 80]
    :param threshold:阈值,低于这个值的边框会被过滤掉
    :return:
    scores: 装载着保留下的边框的概率
    boxes: 装载着保留下的边框的坐标
    classes: 装载着保留下的那些边框的种类索引
    """
    # 将pc与c相乘,得到具体某个种类是否存在的概率(置信度),假设某个边界框的 Objectness Score(即PC) 为 0.8,
    # 表示模型认为该框中有 80% 的可能性存在某个物体;而对应的某个类别的 Class Probability 为 0.7,
    # 那么总的置信度为:Total Confidence=0.8×0.7=0.56 即这个边界框的总置信度分数为 0.56。
    box_scores = box_confidence * box_class_probs  # [13, 13, 3, 80]
    # Index of the highest-scoring class (-1 means along the last axis)
    box_classes = tf.argmax(box_scores, axis=-1)  # [13, 13, 3]
    # Score of that highest-scoring class
    box_class_scores = tf.reduce_max(box_scores, axis=-1)  # [13, 13, 3]

    # Build a mask: positions where the best class score is >= threshold become True, the rest
    # False, so filtering_mask looks like [False, True, False, ...]
    filtering_mask = tf.greater_equal(box_class_scores, threshold)  # [13, 13, 3]
    # Apply the mask to drop the low-scoring boxes. Afterwards scores, boxes, and classes hold
    # only the scores, coordinates, and class indices of the surviving boxes, flattened across
    # all grid cells and anchors.
    scores = tf.boolean_mask(box_class_scores, filtering_mask)
    boxes = tf.boolean_mask(boxes, filtering_mask)
    classes = tf.boolean_mask(box_classes, filtering_mask)
    return scores, boxes, classes
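
The boolean_mask calls are what collapse the [13, 13, 3] grid structure into flat vectors. A tiny standalone demonstration of that behavior (unrelated to the model itself):

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
mask = tf.constant([[True, False], [False, True]])
# When the mask has the same rank as the tensor, the kept entries come back as a 1-D tensor
print(tf.boolean_mask(x, mask))  # tf.Tensor([1. 4.], shape=(2,), dtype=float32)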


def yolo_non_max_suppression(scores, boxes, classes, max_boxes=20, iou_threshold=0.5):
    """
    非最大值值抑制技术过滤掉重叠的边框
    :param scores: 已经过滤过的 各个框大于概率大于0.6的框展开的概率
    :param boxes:已经过滤过的 各个框大于概率大于0.6的框展开的坐标
    :param classes:已经过滤过的 各个框大于概率大于0.6的框展开的索引
    :param max_boxes: 最多想要保留多少个框
    :param iou_threshold: 交并比,阈值,大于这个阈值的边框才会被非最大值抑制处理
    :return:
    scores-NMS后保留的那些边框的概率值
    boxes-NMS保留下的那些边框的坐标
    classes --NMS保留下的那些边框的种类索引
    """
    # IOU两处使用场景
    # - 训练阶段:IOU 主要用于正负样本匹配和辅助损失函数计算,用来比较预测值和真实标签。
    # - 推理阶段:NMS 通过计算多个预测框之间的 IOU 来去除重叠的框。
    # 会返回NMS后保留下来的边框索引
    nms_indices = tf.image.non_max_suppression(boxes, scores, max_boxes, iou_threshold=iou_threshold)
    # 通过上面的索引来分别获取被保留下来的边框的相关概率值、坐标以及种类索引
    scores = tf.gather(scores, nms_indices)
    boxes = tf.gather(boxes, nms_indices)
    classes = tf.gather(classes, nms_indices)
    return scores, boxes, classes
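
tf.image.non_max_suppression computes the IoU internally; as a minimal sketch, the overlap measure for two boxes in corner format (x1, y1, x2, y2) could be written as:

def iou(box1, box2):
    # Intersection rectangle (empty intersections clamp to zero area)
    xi1, yi1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    xi2, yi2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0.0, xi2 - xi1) * max(0.0, yi2 - yi1)
    # Union = sum of the two areas minus the intersection
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (area1 + area2 - inter)

print(iou((0, 0, 2, 2), (1, 1, 3, 3)))  # 1 / 7 ≈ 0.1429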


def yolo_eval(outputs, max_boxes=50, score_threshold=.5, iou_threshold=.6):
    """
    过滤多余边框
    :param outputs:  YOLO模型结果
    :param max_boxes: 最多识别出的边框
    :param score_threshold: 概率值阈值
    :param iou_threshold:  交并比阈值(用于推理结果的NMS)
    :return:
    scores -- 最终保留下那些边框的概率值
    boxes--最终保留下的那些边框的坐标
    classes-- 最终保留下的那些边框的种类的索引
    """
    s, b, c = [], [], []
    # 后续调用yolov3时,使用了3个规格的网格(13*13, 26*26, 52*52)进行预测,所以有3组output
    for output in outputs:
        # YOLO的输出结果分成3分:概率值、坐标、种类索引
        box_confidence, boxes, box_class_probs = output
        scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=score_threshold)
        s.append(scores)
        b.append(boxes)
        c.append(classes)
    # Merge the results from the 3 scales into one set
    scores = tf.concat(s, axis=0)
    boxes = tf.concat(b, axis=0)
    classes = tf.concat(c, axis=0)

    #  NMS
    scores, boxes, classes = yolo_non_max_suppression(scores, boxes, classes, max_boxes=max_boxes,
                                                      iou_threshold=iou_threshold)
    return scores, boxes, classes


def test3():
    yolo_output = (tf.random.normal([13, 13, 3, 1], mean=1, stddev=4, seed=1),
                   tf.random.normal([13, 13, 3, 4], mean=1, stddev=4, seed=1),
                   tf.random.normal([13, 13, 3, 80], mean=1, stddev=4, seed=1))
    yolo_output1 = (tf.random.normal([26, 26, 3, 1], mean=1, stddev=4, seed=2),
                    tf.random.normal([26, 26, 3, 4], mean=1, stddev=4, seed=2),
                    tf.random.normal([26, 26, 3, 80], mean=1, stddev=4, seed=2))
    yolo_output2 = (tf.random.normal([52, 52, 3, 1], mean=1, stddev=4, seed=3),
                    tf.random.normal([52, 52, 3, 4], mean=1, stddev=4, seed=3),
                    tf.random.normal([52, 52, 3, 80], mean=1, stddev=4, seed=3))
    yolo_outputs = (yolo_output, yolo_output1, yolo_output2)
    scores, boxes, classes = yolo_eval(yolo_outputs)
    print("scores[2] = ", scores[2])
    print("boxes[2] = ", boxes[2])
    print("classes[2] = ", classes[2])
    print("scores.shape = ", scores.shape)
    print("boxes.shape = ", boxes.shape)
    print("classes.shape = ", classes.shape)


def test1():
    box_confidence = tf.random.normal([13, 13, 3, 1], mean=1, stddev=4, seed=1)
    boxes = tf.random.normal([13, 13, 3, 4], mean=1, stddev=4, seed=1)
    box_class_probs = tf.random.normal([13, 13, 3, 80], mean=1, stddev=4, seed=1)
    scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=0.5)
    print("scores[2] = ", scores[2])
    print("boxes[2] = ", boxes[2])
    print("classes[2] = ", classes[2])
    print("scores.shape = ", scores.shape)
    print("boxes.shape = ", boxes.shape)
    print("classes.shape = ", classes.shape)


def test2():
    scores = tf.random.normal([54, ], mean=1, stddev=4, seed=1)
    boxes = tf.random.normal([54, 4], mean=1, stddev=4, seed=1)
    classes = tf.random.normal([54, ], mean=1, stddev=4, seed=1)
    scores, boxes, classes = yolo_non_max_suppression(scores, boxes, classes)
    print("scores[2] = ", scores[2])
    print("boxes[2] = ", boxes[2])
    print("classes[2] = ", classes[2])
    print("scores.shape = ", scores.shape)
    print("boxes.shape = ", boxes.shape)
    print("classes.shape = ", classes.shape)


def img_show(img_file_path, out_scores, out_boxes, out_classes, class_names):
    img_raw = tf.image.decode_image(open(img_file_path, 'rb').read(), channels=3)
    img = cv2.cvtColor(img_raw.numpy(), cv2.COLOR_RGB2BGR)
    colors = generate_colors(class_names)
    print("Found {} boxes for {}".format(len(out_boxes), img_file_path))
    img = draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names)
    # display(Image(data=bytes(cv2.imencode(".jpg", img)[1]), width=800))
    # file_name = [x for x in img_file_path.split(".")]
    # cv2.imwrite('./out/' + file_name[0] + '_out.' + file_name[1], img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def predict(model, img_file_path, anchors, class_names):
    img_raw = tf.image.decode_image(open(img_file_path, 'rb').read(), channels=3)
    img = tf.expand_dims(img_raw, 0)
    img = tf.image.resize(img, (416, 416)) / 255.
    yolo_outputs = model(img)
    outputs = yolo_head(yolo_outputs, anchors, len(class_names))
    out_scores, out_boxes, out_classes = yolo_eval(outputs)
    img = img_show(img_file_path, out_scores, out_boxes, out_classes, class_names)
    plt.imshow(img)
    plt.show()


def predict_frame(model, frame, anchors, class_names):
    img = tf.expand_dims(frame, 0)
    img = tf.image.resize(img, (416, 416)) / 255.
    yolo_outputs = model(img)
    outputs = yolo_head(yolo_outputs, anchors, len(class_names))
    out_scores, out_boxes, out_classes = yolo_eval(outputs)
    img = img_show_frame(frame, out_scores, out_boxes, out_classes, class_names)
    return img


def img_show_frame(frame, out_scores, out_boxes, out_classes, class_names):
    img_raw = tf.constant(frame)
    img = cv2.cvtColor(img_raw.numpy(), cv2.COLOR_RGB2BGR)
    colors = generate_colors(class_names)
    img = draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names)
    # display(Image(data=bytes(cv2.imencode(".jpg", img)[1]), width=800))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def predict_video(model, video_file, anchors, class_names):
    # Open the video file
    cap = cv2.VideoCapture(video_file)
    # Get the video's width, height, and frame rate
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Output file configuration
    fourcc = cv2.VideoWriter.fourcc(*'mp4v')  # use the mp4v codec
    output_video = cv2.VideoWriter('output_video.mp4', fourcc, fps, (width, height))
    # Total frame count, used for progress reporting
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"total frames: {total_frames}")
    i = 0
    # Process the video frame by frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        i = i + 1
        # Run the model on this frame
        result = predict_frame(model, frame, anchors, class_names)
        print(f"deal frame_{i}/{total_frames}")
        # Write the annotated frame to the output video
        output_video.write(result)

        # # Display the processed frame (optional)
        # cv2.imshow('Frame', result)
        # if cv2.waitKey(1) & 0xFF == ord('q'):
        #     break
    # Release resources
    cap.release()
    output_video.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    # test1()
    # test2()
    # test3()

    class_names = read_classes("model_data/coco_classes.txt")
    anchors = read_anchors("model_data/yolo_anchors.txt")
    # Load the model
    yolo_model = tf.keras.models.load_model('model_data/yolo_model.h5')
    # yolo_model.summary()
    # img = predict(yolo_model, '/tmp/images/0070.jpg', anchors,class_names)
    predict_video(yolo_model, '/tmp/424_1729159656.mp4', anchors, class_names)

yolo_utils.py

import tensorflow as tf
import numpy as np
import cv2
import colorsys
import random
from IPython.display import Image, display


# As tensorflow lite doesn't support tf.size used in tf.meshgrid,
# we reimplemented a simple meshgrid function that use basic tf function.
def _meshgrid(n_a, n_b):
    return [
        tf.reshape(tf.tile(tf.range(n_a), [n_b]), (n_b, n_a)),
        tf.reshape(tf.repeat(tf.range(n_b), n_a), (n_b, n_a))
    ]
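
A quick check of what this helper returns, for a 3-wide, 2-tall grid:

gx, gy = _meshgrid(3, 2)
print(gx)  # [[0 1 2], [0 1 2]] -- the x index of each cell
print(gy)  # [[0 0 0], [1 1 1]] -- the y index of each cell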


def yolo_head(preds, anchors, classes):  # preds holds 3 predictions, one per grid scale: 13*13, 26*26, 52*52
    # pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))
    outputs = {}

    for i in range(3):
        pred = preds[i]
        grid_size = tf.shape(pred)[1:3]
        box_xy, box_wh, objectness, class_probs = tf.split(
            pred, (2, 2, 1, classes), axis=-1)

        box_xy = tf.sigmoid(box_xy)
        objectness = tf.sigmoid(objectness)
        class_probs = tf.sigmoid(class_probs)
        # pred_box = tf.concat((box_xy, box_wh), axis=-1)  # original xywh for loss

        # !!! grid[x][y] == (y, x)
        grid = _meshgrid(grid_size[1], grid_size[0])
        grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)  # [gx, gy, 1, 2]

        box_xy = (box_xy + tf.cast(grid, tf.float32)) / \
                 tf.cast(grid_size, tf.float32)

        # The (x, y) coordinates are offsets relative to the grid cell and give the object's
        # center position; the (w, h) values are log-space offsets relative to the anchor box,
        # so they are exponentiated and multiplied by the anchor's width/height to recover the
        # actual size.

        # The index arithmetic 6 - i*3, 7 - i*3, 8 - i*3 picks the anchor group for each scale:
        # anchors[6], anchors[7], anchors[8] when i=0: the deepest layer (13*13), for large objects.
        # anchors[3], anchors[4], anchors[5] when i=1: the middle layer (26*26), for medium objects.
        # anchors[0], anchors[1], anchors[2] when i=2: the shallowest layer (52*52), for small objects.
        box_wh = tf.exp(box_wh) * anchors[[6 - i * 3, 7 - i * 3, 8 - i * 3]]

        box_x1y1 = box_xy - box_wh / 2
        box_x2y2 = box_xy + box_wh / 2
        bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)

        outputs['output' + str(i)] = (objectness, bbox, class_probs)

    return (outputs['output0'], outputs['output1'], outputs['output2'])
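
A worked illustration of the decoding above, with made-up raw outputs, a 13*13 grid, and a hypothetical anchor assumed to be already normalized to the image size:

import numpy as np

tx, ty, tw, th = 0.2, -0.1, 0.0, 0.3   # made-up raw network outputs for one cell/anchor
cx, cy = 6, 6                          # grid cell indices in a 13*13 grid
grid = 13
anchor_w, anchor_h = 0.28, 0.22        # hypothetical anchor width/height

sigmoid = lambda v: 1 / (1 + np.exp(-v))
bx = (sigmoid(tx) + cx) / grid         # center x, as a fraction of image width
by = (sigmoid(ty) + cy) / grid         # center y, as a fraction of image height
bw = np.exp(tw) * anchor_w             # width: exponentiated log-offset times the anchor
bh = np.exp(th) * anchor_h             # height: same transform on the other axis
print((bx - bw / 2, by - bh / 2, bx + bw / 2, by + bh / 2))  # corner format (x1, y1, x2, y2)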


def read_classes(classes_path):
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names


def read_anchors(anchors_path):
    with open(anchors_path) as f:
        anchors = f.readline()
        anchors = [float(x) for x in anchors.split()]
        anchors = np.array(anchors).reshape(-1, 2)
    return anchors
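
Given this parsing (a single line of whitespace-separated numbers reshaped into pairs), the anchors file should hold 9 width/height pairs on one line. The values below are the standard COCO YOLOv3 anchors and only an assumption about what this particular model_data file contains:

10 13 16 30 33 23 30 61 62 45 59 119 116 90 156 198 373 326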


def generate_colors(class_names):
    hsv_tuples = [(x / len(class_names), 1., 1.) for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
    random.seed(10201)  # Fixed seed for consistent colors across runs.
    random.shuffle(colors)  # Shuffle colors to decorrelate adjacent classes.
    random.seed(None)  # Reset seed to default.
    return colors


def preprocess_image(img_path, model_image_size):
    # Mirrors the preprocessing done inline in predict(): decode the image, add a
    # batch dimension, resize to the model input size, and scale pixels to [0, 1].
    img_raw = tf.image.decode_image(open(img_path, 'rb').read(), channels=3)
    img = tf.expand_dims(img_raw, 0)
    img = tf.image.resize(img, model_image_size) / 255.
    return img_raw, img


def draw_outputs(img, out_scores, out_boxes, out_classes, colors, class_names):
    wh = np.flip(img.shape[0:2])
    for i, c in list(enumerate(out_classes)):
        x1y1 = tuple((np.array(out_boxes[i][0:2]) * wh).astype(np.int32))
        x2y2 = tuple((np.array(out_boxes[i][2:4]) * wh).astype(np.int32))
        x1y1_label = tuple((np.array(out_boxes[i][0:2]) * wh + [0, -15]).astype(np.int32))
        x2y2_label = tuple(
            (np.array(out_boxes[i][0:2]) * wh + [(len(class_names[int(out_classes[i])]) + 6) * 12, 0]).astype(np.int32))
        img = cv2.rectangle(img, x1y1, x2y2, colors[c], 2)
        img = cv2.rectangle(img, x1y1_label, x2y2_label, colors[c], -1)
        img = cv2.putText(img, '{} {:.2f}'.format(
            class_names[int(out_classes[i])], out_scores[i]),
                          x1y1, cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 0), 1)
        print('{} {:.2f}'.format(class_names[int(out_classes[i])], out_scores[i]),
              x1y1, x2y2)
    return img
