博主最近研究完faster-rcnn,读完每一行代码,那叫一个算爽,现在开始读tensorflow版的Yolo V1.我参考的代码是github上下载下来的,
tensorflow-yolo
上次讲解完了代码的网络结构以及读取文件,损失函数等等,
传送门
train.py文件讲解
import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc
slim = tf.contrib.slim
class Solver(object):
def __init__(self, net, data):
self.net = net
self.data = data
self.weights_file = cfg.WEIGHTS_FILE
self.max_iter = cfg.MAX_ITER
self.initial_learning_rate = cfg.LEARNING_RATE
self.decay_steps = cfg.DECAY_STEPS
self.decay_rate = cfg.DECAY_RATE
self.staircase = cfg.STAIRCASE
self.summary_iter = cfg.SUMMARY_ITER
self.save_iter = cfg.SAVE_ITER
self.output_dir = os.path.join(
cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
self.save_cfg()
self.variable_to_restore = tf.global_variables()
self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
self.ckpt_file = os.path.join(self.output_dir, 'yolo')
self.summary_op = tf.summary.merge_all()
self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)
self.global_step = tf.train.create_global_step()
self.learning_rate = tf.train.exponential_decay(
self.initial_learning_rate, self.global_step, self.decay_steps,
self.decay_rate, self.staircase, name='learning_rate')
self.optimizer = tf.train.GradientDescentOptimizer(
learning_rate=self.learning_rate)
self.train_op = slim.learning.create_train_op(
self.net.total_loss, self.optimizer, global_step=self.global_step)
gpu_options = tf.GPUOptions()
config = tf.ConfigProto(gpu_options=gpu_options)
self.sess = tf.Session(config=config)
self.sess.run(tf.global_variables_initializer())
if self.weights_file is not None:
print('Restoring weights from: ' + self.weights_file)
self.saver.restore(self.sess, self.weights_file)
self.writer.add_graph(self.sess.graph)
以上只是一些参数的设置
def train(self):
train_timer = Timer()
load_timer = Timer()
for step in range(1, self.max_iter + 1):
load_timer.tic()
images, labels = self.data.get()
load_timer.toc()
feed_dict = {self.net.images: images,
self.net.labels: labels}
if step % self.summary_iter == 0:
if step % (self.summary_iter * 10) == 0:
train_timer.tic()
summary_str, loss, _ = self.sess.run(
[self.summary_op, self.net.total_loss, self.train_op],
feed_dict=feed_dict)
train_timer.toc()
log_str = '''{} Epoch: {}, Step: {}, Learning rate: {},'''
''' Loss: {:5.3f}\nSpeed: {:.3f}s/iter,'''
'''' Load: {:.3f}s/iter, Remain: {}'''.format(
datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
self.data.epoch,
int(step),
round(self.learning_rate.eval(session=self.sess), 6),
loss,
train_timer.average_time,
load_timer.average_time,
train_timer.remain(step, self.max_iter))
print(log_str)
else:
train_timer.tic()
summary_str, _ = self.sess.run(
[self.summary_op, self.train_op],
feed_dict=feed_dict)
train_timer.toc()
self.writer.add_summary(summary_str, step)
else:
train_timer.tic()
self.sess.run(self.train_op, feed_dict=feed_dict)
train_timer.toc()
if step % self.save_iter == 0:
print('{} Saving checkpoint file to: {}'.format(
datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
self.output_dir))
self.saver.save(
self.sess, self.ckpt_file, global_step=self.global_step)
我们终点看train函数,首先Timer是作者定义的一个类,类的tic方法只是记录开始时间,toc是记录结束时间,
images, labels = self.data.get() 这个是得到label和image,就是我们上一篇讲的数据的加载。后面只是迭代多少次保存一下loss等信息,以及多少次保存一下模型。train还是比较简单的。我们再看test吧,
test.py
class Detector(object):
def __init__(self, net, weight_file):
self.net = net
self.weights_file = weight_file
self.classes = cfg.CLASSES
self.num_class = len(self.classes)
self.image_size = cfg.IMAGE_SIZE
self.cell_size = cfg.CELL_SIZE
self.boxes_per_cell = cfg.BOXES_PER_CELL
self.threshold = cfg.THRESHOLD
self.iou_threshold = cfg.IOU_THRESHOLD
self.boundary1 = self.cell_size * self.cell_size * self.num_class
self.boundary2 = self.boundary1 +\
self.cell_size * self.cell_size * self.boxes_per_cell
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
print('Restoring weights from: ' + self.weights_file)
self.saver = tf.train.Saver()
self.saver.restore(self.sess, self.weights_file)
依旧是一些参数的配置,比较简单,
def draw_result(self, img, result):
for i in range(len(result)):
x = int(result[i][1])
y = int(result[i][2])
w = int(result[i][3] / 2)
h = int(result[i][4] / 2)
cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
cv2.rectangle(img, (x - w, y - h - 20),
(x + w, y - h), (125, 125, 125), -1)
lineType = cv2.LINE_AA if cv2.__version__ > '3' else cv2.CV_AA
cv2.putText(
img, result[i][0] + ' : %.2f' % result[i][5],
(x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
(0, 0, 0), 1, lineType)
这个是把结果展示出来,将框和原图放在一起,注意的是cv2.rectangle里面是矩形框左上角和右下角的坐标,所以这个地发有一个转换。
def interpret_output(self, output):
probs = np.zeros((self.cell_size, self.cell_size,
self.boxes_per_cell, self.num_class))
class_probs = np.reshape(
output[0:self.boundary1],
(self.cell_size, self.cell_size, self.num_class))
scales = np.reshape(
output[self.boundary1:self.boundary2],
(self.cell_size, self.cell_size, self.boxes_per_cell))
boxes = np.reshape(
output[self.boundary2:],
(self.cell_size, self.cell_size, self.boxes_per_cell, 4))
offset = np.array(
[np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
offset = np.transpose(
np.reshape(
offset,
[self.boxes_per_cell, self.cell_size, self.cell_size]),
(1, 2, 0))
boxes[:, :, :, 0] += offset
boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])
boxes *= self.image_size
for i in range(self.boxes_per_cell):
for j in range(self.num_class):
probs[:, :, i, j] = np.multiply(
class_probs[:, :, j], scales[:, :, i])
filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
filter_mat_boxes = np.nonzero(filter_mat_probs)
boxes_filtered = boxes[filter_mat_boxes[0],
filter_mat_boxes[1], filter_mat_boxes[2]]
probs_filtered = probs[filter_mat_probs]
classes_num_filtered = np.argmax(
filter_mat_probs, axis=3)[
filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]
argsort = np.array(np.argsort(probs_filtered))[::-1]
boxes_filtered = boxes_filtered[argsort]
probs_filtered = probs_filtered[argsort]
classes_num_filtered = classes_num_filtered[argsort]
for i in range(len(boxes_filtered)):
if probs_filtered[i] == 0:
continue
for j in range(i + 1, len(boxes_filtered)):
if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
probs_filtered[j] = 0.0
filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
boxes_filtered = boxes_filtered[filter_iou]
probs_filtered = probs_filtered[filter_iou]
classes_num_filtered = classes_num_filtered[filter_iou]
result = []
for i in range(len(boxes_filtered)):
result.append(
[self.classes[classes_num_filtered[i]],
boxes_filtered[i][0],
boxes_filtered[i][1],
boxes_filtered[i][2],
boxes_filtered[i][3],
probs_filtered[i]])
return result
上面这个方法,就是把预测结果准换回来,把它弄成对应图像上的长和宽,以及总共预测有多少个类别。
def detect_from_cvmat(self, inputs):
net_output = self.sess.run(self.net.logits,
feed_dict={self.net.images: inputs})
results = []
for i in range(net_output.shape[0]):
results.append(self.interpret_output(net_output[i]))
return results
这个地方就是预测,预测测试集的结果
def detect(self, img):
img_h, img_w, _ = img.shape
inputs = cv2.resize(img, (self.image_size, self.image_size))
inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
inputs = (inputs / 255.0) * 2.0 - 1.0
inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))
result = self.detect_from_cvmat(inputs)[0]
for i in range(len(result)):
result[i][1] *= (1.0 * img_w / self.image_size)
result[i][2] *= (1.0 * img_h / self.image_size)
result[i][3] *= (1.0 * img_w / self.image_size)
result[i][4] *= (1.0 * img_h / self.image_size)
return result
这个地方将预测的结果转换一下,因为我们的图像是resize了的,所以这个地方就是把对应坐标变回到原始图像中。