yolov1已经出来很久了,起初公布的代码作者采用的是自己实现的网络框架darknet,该网络框架是基于c语言实现。尽管现在有着各种各样的框架,不过个人感觉tensorflow在使用方面依旧有着很强的优势,主要体现在扩展性方面。不过因人而异吧,毕竟只是一个工具而已。网上其他博客也有对这份代码的讲解,不过个人感觉依旧有些不到位,所以决定还是结合自己的理解讲解下。下面的代码主要是基于tensorflow实现的yolov1,基本理论可以查看:https://blog.csdn.net/yongjiankuang/article/details/71057288。
首先是配置文件config.py,主要包含了网络的一些基本参数信息
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 28 22:31:05 2018
@author: Administrator
config.py

Global configuration shared by the other modules: dataset paths, model
hyper-parameters, solver settings and test-time detection thresholds.
"""
import os
# dataset / output paths
DATA_PATH = 'data'
PASCAL_PATH = os.path.join(DATA_PATH,'pascal_voc')
CACHE_PATH = os.path.join(PASCAL_PATH,'cache')
OUTPUT_DIR = os.path.join(PASCAL_PATH,'output')
WEIGHTS_DIR = os.path.join(PASCAL_PATH,'weights')
WEIGHTS_FILE = None
# the 20 PASCAL VOC object categories
CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
'train', 'tvmonitor']
FLIPPED = True #augment training data with horizontally flipped copies
#model parameter
IMAGE_SIZE = 448 #network input resolution (images are resized to this)
CELL_SIZE = 7 #grid size S: the image is split into SxS cells
BOXES_PER_CELL = 2 #boxes predicted per cell (B)
ALPHA = 0.1 #leaky-ReLU negative slope
DISP_CONSOLE = False
OBJECT_SCALE = 1.0 #loss weight for boxes responsible for an object
NOOBJECT_SCALE= 1.0 #loss weight for boxes with no object
CLASS_SCALE = 2.0 #loss weight for the classification term
COORD_SCALE = 5.0 #loss weight for the box-coordinate term
#
# solver parameter
#
GPU = False
LEARNING_RATE = 0.0001
DECAY_STEPS = 30000
DECAY_RATE = 0.1
STAIRCASE = True
BATCH_SIZE = 45
MAX_ITER = 15000
SUMMARY_ITER = 10
SAVE_ITER = 1000
#
# test parameter
#
THRESHOLD = 0.2 #minimum class-confidence score to keep a detection
IOU_THRESHOLD = 0.5 #IoU above which overlapping boxes are suppressed (NMS)
yolo_net.py主要是网络结构的声明
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 28 22:32:26 2018
@author: kuangyongjian
"""
import numpy as np
import tensorflow as tf
import yolo.config as cfg
slim = tf.contrib.slim
class YOLONet(object):
    """YOLO v1 detection network built with TF-Slim.

    Maps a 448x448x3 image to a flat 1470-d vector laid out as
    [7*7*20 class probs | 7*7*2 confidences | 7*7*2*4 boxes].  When
    `is_training` is True the four-part YOLO loss is also attached to
    the tf.losses collection.
    """

    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES                # object class names
        self.num_class = len(self.classes)        # number of classes (20 for VOC)
        self.image_size = cfg.IMAGE_SIZE          # network input resolution
        self.cell_size = cfg.CELL_SIZE            # grid size S
        self.boxes_per_cell = cfg.BOXES_PER_CELL  # boxes per cell B
        # output dimension: S*S*(C + B*5) = 7*7*(20 + 2*5) = 1470
        self.output_size = (self.cell_size * self.cell_size) * \
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size  # pixels per cell
        # boundary1 ends the S*S*C class-probability section,
        # boundary2 ends the S*S*B confidence section; boxes fill the rest
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 + \
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.object_scale = cfg.OBJECT_SCALE      # loss weight: responsible boxes
        self.noobject_scale = cfg.NOOBJECT_SCALE  # loss weight: empty boxes
        self.class_scale = cfg.CLASS_SCALE        # loss weight: classification
        self.coord_scale = cfg.COORD_SCALE        # loss weight: coordinates

        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA                    # leaky-ReLU slope

        # per-cell column offsets, reshaped [B,S,S] -> [S,S,B]
        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))

        tf.reset_default_graph()
        # input placeholder: batch of 448x448 RGB images
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        # build the graph; logits holds the flat 1470-d predictions
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            # ground truth per cell: [response, x, y, w, h, one-hot classes]
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
        """Build the 24-conv + 2-fc YOLO v1 backbone; return the logits.

        All conv/fc layers share leaky-ReLU activation, L2 weight decay
        and truncated-normal initialization via arg_scope.
        """
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                # pad height/width only; batch and channel dims untouched
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')            # -> 224x224x64
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')  # -> 112x112x64
                net = slim.conv2d(net, 192, 3, scope='conv_4')                 # -> 112x112x192
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')  # -> 56x56x192
                net = slim.conv2d(net, 128, 1, scope='conv_6')                 # -> 56x56x128
                net = slim.conv2d(net, 256, 3, scope='conv_7')                 # -> 56x56x256
                net = slim.conv2d(net, 256, 1, scope='conv_8')                 # -> 56x56x256
                net = slim.conv2d(net, 512, 3, scope='conv_9')                 # -> 56x56x512
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10') # -> 28x28x512
                net = slim.conv2d(net, 256, 1, scope='conv_11')                # -> 28x28x256
                net = slim.conv2d(net, 512, 3, scope='conv_12')                # -> 28x28x512
                net = slim.conv2d(net, 256, 1, scope='conv_13')                # -> 28x28x256
                net = slim.conv2d(net, 512, 3, scope='conv_14')                # -> 28x28x512
                net = slim.conv2d(net, 256, 1, scope='conv_15')                # -> 28x28x256
                net = slim.conv2d(net, 512, 3, scope='conv_16')                # -> 28x28x512
                net = slim.conv2d(net, 256, 1, scope='conv_17')                # -> 28x28x256
                net = slim.conv2d(net, 512, 3, scope='conv_18')                # -> 28x28x512
                net = slim.conv2d(net, 512, 1, scope='conv_19')                # -> 28x28x512
                net = slim.conv2d(net, 1024, 3, scope='conv_20')               # -> 28x28x1024
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21') # -> 14x14x1024
                net = slim.conv2d(net, 512, 1, scope='conv_22')                # -> 14x14x512
                net = slim.conv2d(net, 1024, 3, scope='conv_23')               # -> 14x14x1024
                net = slim.conv2d(net, 512, 1, scope='conv_24')                # -> 14x14x512
                net = slim.conv2d(net, 1024, 3, scope='conv_25')               # -> 14x14x1024
                net = slim.conv2d(net, 1024, 3, scope='conv_26')               # -> 14x14x1024
                # explicit padding so the strided conv below lands on 7x7
                net = tf.pad(
                    net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                    name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')         # -> 7x7x1024
                net = slim.conv2d(net, 1024, 3, scope='conv_29')               # -> 7x7x1024
                net = slim.conv2d(net, 1024, 3, scope='conv_30')               # -> 7x7x1024
                # NHWC -> NCHW before flattening (matches the original
                # darknet weight layout)
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                # final linear layer: no activation on the raw predictions
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Element-wise IoU between two box tensors given as (cx, cy, w, h).

        Broadcasts over all leading dimensions; returns values in [0, 1].
        """
        with tf.variable_scope(scope):
            # convert (cx, cy, w, h) to corner form (x1, y1, x2, y2)
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)
            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)
            # corners of the intersection rectangle
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])
            # clamp to zero so disjoint boxes get zero intersection
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]
            # individual box areas
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]
            # union area, clamped away from zero to avoid division by zero
            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)
            return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        """Attach the YOLO loss (class/object/no-object/coord) to tf.losses."""
        with tf.variable_scope(scope):
            # slice the flat prediction into classes / confidences / boxes
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size,
                 self.num_class])
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size,
                 self.boxes_per_cell])
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size,
                 self.boxes_per_cell, 4])

            # label layout per cell: [response, box(4), classes(20)]
            response = tf.reshape(
                labels[..., 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[..., 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            # BUG FIX: tile first, THEN normalize the pixel coordinates by
            # the image size.  The original divided the Python multiples
            # list itself by image_size, which raises a TypeError.
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            classes = labels[..., 5:]

            # per-cell column offsets, replicated across the batch
            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))  # row offsets

            # predictions -> image-relative boxes: x,y are cell-relative,
            # w,h are predicted as square roots, so square them back
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)
            # responsible-box mask [BATCH, S, S, B]: 1 where this box has
            # the best IoU in a cell that actually contains an object
            object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response
            # complement: boxes that should predict zero confidence
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask

            # ground truth in the same parameterization as the predictions
            boxes_tran = tf.stack([
                boxes[..., 0] * self.cell_size - offset,
                boxes[..., 1] * self.cell_size - offset_tran,
                tf.sqrt(boxes[..., 2]),
                tf.sqrt(boxes[..., 3])], axis=-1)

            # classification loss (only cells containing an object)
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale
            # confidence loss for responsible boxes (target is the IoU)
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale
            # confidence loss for non-responsible boxes (target is zero)
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale
            # coordinate loss for responsible boxes only
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)
# leaky-ReLU activation factory (slim-compatible)
def leaky_relu(alpha):
    """Return an activation callable computing leaky ReLU with slope `alpha`."""
    def op(inputs):
        return tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')

    return op
pascal_voc.py主要是voc数据信息转换为网络训练可读数据
"""
pascal_voc:主要功能
对图像数据进行归一化,同时获取相应的标签数据
"""
import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg
class pascal_voc(object):
    """PASCAL VOC 2007 data source.

    Serves batches of images normalized to [-1, 1] together with 7x7x25
    cell labels laid out as [response, cx, cy, w, h, 20 one-hot classes]
    (box coordinates in 448-pixel units).
    """

    def __init__(self, phase, rebuild=False):
        self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        # BUG FIX: the original referenced the non-existent `self.devkil`
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        # class name -> class index lookup
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = cfg.FLIPPED    # whether to add flipped copies
        self.phase = phase            # 'train' or 'test'
        self.rebuild = rebuild        # force re-parsing of annotations
        self.cursor = 0               # current position in gt_labels
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        """Return the next (images, labels) batch, reshuffling at epoch end."""
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size, 25))
        count = 0
        while count < self.batch_size:
            imname = self.gt_labels[self.cursor]['imname']
            flipped = self.gt_labels[self.cursor]['flipped']
            images[count, :, :, :] = self.image_read(imname, flipped)
            labels[count, :, :, :] = self.gt_labels[self.cursor]['label']
            count += 1
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                # one full pass done: reshuffle and start the next epoch
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        """Load an image file, resize, convert BGR->RGB, scale to [-1, 1]."""
        image = cv2.imread(imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        # BUG FIX: the original tested the misspelled name `flippedd`,
        # which raised a NameError whenever a flipped sample was requested
        if flipped:
            image = image[:, ::-1, :]  # mirror along the x axis
        return image

    def prepare(self):
        """Load labels, optionally append horizontally-flipped copies."""
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for idx in range(len(gt_labels_cp)):
                gt_labels_cp[idx]['flipped'] = True
                # mirror the label grid horizontally
                gt_labels_cp[idx]['label'] = \
                    gt_labels_cp[idx]['label'][:, ::-1, :]
                for i in range(self.cell_size):
                    for j in range(self.cell_size):
                        if gt_labels_cp[idx]['label'][i, j, 0] == 1:
                            # mirror the box center x coordinate
                            gt_labels_cp[idx]['label'][i, j, 1] = \
                                self.image_size - 1 - \
                                gt_labels_cp[idx]['label'][i, j, 1]
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        """Return gt_labels [{imname, label, flipped}], using a pickle cache."""
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')
        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('processing gt_labels from: ' + self.data_path)
        if not os.path.exists(self.cache_path):
            os.makedirs(self.cache_path)
        if self.phase == 'train':
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            self.image_index = [x.strip() for x in f.readlines()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                # skip images without any annotated object
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to:' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """Parse one XML annotation into a 7x7x25 label grid.

        Returns (label, num_objects).  Box coordinates are stored in
        448-pixel units as (cx, cy, w, h).
        """
        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        im = cv2.imread(imname)
        # scale factors from the original image size to 448x448
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]

        label = np.zeros((self.cell_size, self.cell_size, 25))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')

        for obj in objs:
            bbox = obj.find('bndbox')
            # rescale corners (VOC is 1-indexed) and clamp to the image
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio,
                         self.image_size - 1), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio,
                         self.image_size - 1), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio,
                         self.image_size - 1), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio,
                         self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            # corner form -> (cx, cy, w, h)
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            # grid cell that the box center falls into
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            if label[y_ind, x_ind, 0] == 1:
                # cell already owns an object; YOLO v1 keeps only the first
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1
        return label, len(objs)
timer.py主要用于打印代码运行过程中耗时信息
import time
import datetime
class Timer(object):
    """A simple wall-clock timer with running average and ETA estimate."""

    def __init__(self):
        self.init_time = time.time()  # construction time, used for ETA
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # time.time() rather than time.clock(): clock() does not
        # normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop the timer; return the running average or the last interval."""
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

    def remain(self, iters, max_iters):
        """Estimate remaining run time as a 'H:MM:SS' string."""
        elapsed = time.time() - self.init_time
        self.remain_time = \
            0 if iters == 0 else elapsed * (max_iters - iters) / iters
        return str(datetime.timedelta(seconds=int(self.remain_time)))
test.py是网络的测试程序
import os
import cv2
import argparse
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
class Detector(object):
    """Runs a trained YOLONet on images or camera frames and draws results."""

    def __init__(self, net, weights_file):
        self.net = net                    # YOLONet instance (graph + placeholders)
        self.weights_file = weights_file  # checkpoint to restore
        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.threshold = cfg.THRESHOLD          # class-confidence threshold
        self.iou_threshold = cfg.IOU_THRESHOLD  # NMS overlap threshold
        # boundaries of the class / confidence sections of the 1470-d output
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 + \
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        print('Restoring weights from: ' + self.weights_file)
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, self.weights_file)

    def draw_result(self, img, result):
        """Draw boxes and class/score captions on `img` in place.

        Each `result` row is [class_name, cx, cy, w, h, prob].
        """
        for i in range(len(result)):
            x = int(result[i][1])
            y = int(result[i][2])
            w = int(result[i][3] / 2)
            h = int(result[i][4] / 2)
            cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
            # filled grey caption background above the box
            cv2.rectangle(img, (x - w, y - h - 20),
                          (x + w, y - h), (125, 125, 125), -1)
            lineType = cv2.LINE_AA if cv2.__version__ > '3' else cv2.CV_AA
            cv2.putText(
                img, result[i][0] + ' : %.2f' % result[i][5],
                (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 0), 1, lineType)

    def detect(self, img):
        """Detect objects in one BGR image; boxes are rescaled back to the
        original image size."""
        img_h, img_w, _ = img.shape
        inputs = cv2.resize(img, (self.image_size, self.image_size))
        inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
        inputs = (inputs / 255.0) * 2.0 - 1.0  # normalize to [-1, 1]
        inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))
        result = self.detect_from_cvmat(inputs)[0]
        # map coordinates from 448x448 network space back to the input image
        for i in range(len(result)):
            result[i][1] *= (1.0 * img_w / self.image_size)
            result[i][2] *= (1.0 * img_h / self.image_size)
            result[i][3] *= (1.0 * img_w / self.image_size)
            result[i][4] *= (1.0 * img_h / self.image_size)
        return result

    def detect_from_cvmat(self, inputs):
        """Run the network on a preprocessed batch; one result list per image."""
        net_output = self.sess.run(self.net.logits,
                                   feed_dict={self.net.images: inputs})
        results = []
        for i in range(net_output.shape[0]):
            results.append(self.interpret_output(net_output[i]))
        return results

    def interpret_output(self, output):
        """Decode one 1470-d output vector into a list of
        [class_name, cx, cy, w, h, prob] rows, applying the confidence
        threshold and non-maximum suppression."""
        probs = np.zeros((self.cell_size, self.cell_size,
                          self.boxes_per_cell, self.num_class))
        class_probs = np.reshape(
            output[0:self.boundary1],
            (self.cell_size, self.cell_size, self.num_class))       # [7,7,20]
        scales = np.reshape(
            output[self.boundary1:self.boundary2],
            (self.cell_size, self.cell_size, self.boxes_per_cell))  # [7,7,2]
        boxes = np.reshape(
            output[self.boundary2:],
            (self.cell_size, self.cell_size, self.boxes_per_cell, 4))
        offset = np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
        offset = np.transpose(
            np.reshape(
                offset,
                [self.boxes_per_cell, self.cell_size, self.cell_size]),
            (1, 2, 0))  # [2,7,7] -> [7,7,2]
        # cell-relative x,y -> image-relative; w,h were predicted as sqrt
        boxes[:, :, :, 0] += offset
        boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
        boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
        boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])
        boxes *= self.image_size  # scale to 448x448 pixel coordinates

        # per-box class score = class probability * box confidence
        for i in range(self.boxes_per_cell):
            for j in range(self.num_class):
                probs[:, :, i, j] = np.multiply(
                    class_probs[:, :, j], scales[:, :, i])

        # keep only boxes whose score clears the threshold
        filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
        filter_mat_boxes = np.nonzero(filter_mat_probs)
        boxes_filtered = boxes[filter_mat_boxes[0],
                               filter_mat_boxes[1],
                               filter_mat_boxes[2]]
        probs_filtered = probs[filter_mat_probs]
        classes_num_filtered = np.argmax(
            filter_mat_probs, axis=3)[
                filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]

        # sort by score, then suppress overlapping lower-score boxes (NMS)
        argsort = np.array(np.argsort(probs_filtered))[::-1]
        boxes_filtered = boxes_filtered[argsort]
        probs_filtered = probs_filtered[argsort]
        classes_num_filtered = classes_num_filtered[argsort]
        for i in range(len(boxes_filtered)):
            if probs_filtered[i] == 0:
                continue
            for j in range(i + 1, len(boxes_filtered)):
                if self.iou(boxes_filtered[i],
                            boxes_filtered[j]) > self.iou_threshold:
                    probs_filtered[j] = 0.0
        filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
        boxes_filtered = boxes_filtered[filter_iou]
        probs_filtered = probs_filtered[filter_iou]
        classes_num_filtered = classes_num_filtered[filter_iou]

        result = []
        for i in range(len(boxes_filtered)):
            result.append([self.classes[classes_num_filtered[i]],
                           boxes_filtered[i][0],
                           boxes_filtered[i][1],
                           boxes_filtered[i][2],
                           boxes_filtered[i][3],
                           probs_filtered[i]])
        return result

    def iou(self, box1, box2):
        """IoU of two boxes given as (cx, cy, w, h)."""
        tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
            max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])  # overlap width
        lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
            max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])  # overlap height
        inter = 0 if tb < 0 or lr < 0 else tb * lr
        return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter)

    def camera_detector(self, cap, wait=10):
        """Detect continuously on frames from an opened cv2.VideoCapture."""
        detect_timer = Timer()
        ret, _ = cap.read()
        while ret:
            ret, frame = cap.read()
            detect_timer.tic()
            result = self.detect(frame)
            detect_timer.toc()
            print('Average detecting time: {:.3f}s'.format(
                detect_timer.average_time))
            self.draw_result(frame, result)
            cv2.imshow('camera', frame)
            # BUG FIX: the original called the non-existent cv2.waiKey,
            # which raised an AttributeError on the first frame
            cv2.waitKey(wait)
            ret, frame = cap.read()

    def image_detector(self, imname, wait=0):
        """Detect on a single image file and display the annotated result."""
        detect_timer = Timer()
        image = cv2.imread(imname)
        detect_timer.tic()
        result = self.detect(image)
        detect_timer.toc()
        print('Average detecting time: {:.3f}s'.format(
            detect_timer.average_time))
        self.draw_result(image, result)
        cv2.imshow('Image', image)
        cv2.waitKey(wait)
def main():
    """Parse CLI arguments, build the network and detect on a test image."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default='YOLO_small.ckpt', type=str)
    parser.add_argument('--weight_dir', default='weights', type=str)
    parser.add_argument('--data_dir', default='data', type=str)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    # restrict visible GPUs before any TF graph/session is created
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    yolo = YOLONet(False)  # inference graph only (no loss)
    weight_file = os.path.join(args.data_dir, args.weight_dir, args.weights)
    detector = Detector(yolo, weight_file)
    detector.image_detector('test/person.jpg')


if __name__ == '__main__':
    main()
train.py是网络训练程序
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 4 17:35:45 2018
@author: Administrator
"""
import os
import argparse
import datetime
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc
slim = tf.contrib.slim
class Solver(object):
    """Training driver: solver hyper-parameters, output dir and saver.

    NOTE(review): this class appears truncated in the listing — `save_cfg`
    and the training loop are not shown here and are assumed to be defined
    on the full class.
    """

    def __init__(self, net, data):
        self.net = net    # YOLONet instance (graph + total_loss)
        self.data = data  # pascal_voc data source
        # BUG FIX: the original read cfg.WIEGHTS_FILE (typo), which
        # raises AttributeError; config.py defines WEIGHTS_FILE
        self.weights_file = cfg.WEIGHTS_FILE
        self.max_iter = cfg.MAX_ITER
        self.initial_learning_rate = cfg.LEARNING_RATE
        # BUG FIX: decay_steps was mistakenly loaded from cfg.DECAY_RATE,
        # making the learning rate decay every 0.1 steps
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.save_iter = cfg.SAVE_ITER
        # per-run output directory stamped with the current time
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.save_cfg()
        # snapshot every graph variable (keep all checkpoints)
        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
下面是程序运行的效果图
注:该程序非本人所写,基本是对别人的代码进行了一遍复现与讲解,若有不当之处,请指教,谢谢!
csdn(包含模型文件)代码下载链接:https://download.csdn.net/download/yongjiankuang/10627481
github(不包含模型文件):https://github.com/yongjiankuang/yolo