1.MTCNN 的优点及必须要了解基础点。
MTCNN 的 “MT”是指多任务学习(Multi-Task),在同一个任务中同时学习“分类识别”、“边框回归”,“关键点识别”。
多尺度问题一直是困扰检测准确性的一个难点。MTCNN使用图像金字塔来解决目标多尺度问题。(图像金字塔百度上介绍非常多,我这里不过多叙述)。
P-NET的网络模型是用单尺度(12X12)的图片训练出来的,想要更准确地识别各种尺度的人脸,需要把待识别的图片按照一定的比例多次等比例缩放(缩一次识别一次,直到缩到接近12x12)。
缺点是非常慢,生成图片金字塔慢,每种尺度的图片都需要喂入模型中,相当于执行了多次模型推断流程。
MTCNN算法可以接受任意尺度的图片。第一阶段的P-NET是一个全卷积网络,卷积、池化、非线性激活都是可以接受任意尺度矩阵的运算,**但全连接运算需要规定输入尺寸。** 因此如果网络包含全连接层,输入的图片尺度就需要固定;如果没有全连接层,图片尺度可以是任意的。(当然有例外:也有既包含全连接层又能接受任意尺度图片的结构,例如【Pyramid Pooling 空间金字塔池化】,可以百度。)
设置适合的最小人脸尺度和缩放因子可以优化计算效率,官方经验是0.709。minsize 是指你认为图片中需要识别人脸的最小尺度,factor是指每次对边缩放的倍数。P-NET预测阶段会多次缩放原图得到图片金字塔,目的是为了让缩放后的图片中的人脸与P-NET训练时候的图片尺度(12px * 12px)接近,先把原图等比例缩放 “【12 / minsize】” 。即 (原图大小 x【12 / minsize】)缩放一次 ,再按factor 用上一次的缩放结果不断缩放,直到最短边小于或等于12,推断出 minsize 越大,生成的“金字塔”层数越少,resize和pnet的计算量越小。
在输入模型前对图片每个像素做(x - 127.5)/ 128 的操作。此操作可以使图片像素归一化,加快收敛速度。由于图片每个像素点是 [0-255] 的数,且都是非负数,加入此操作,可以把 [0-255] 映射到(-1,1)区间内。有正有负的输入,收敛速度更快。训练需要此操作,预测时也需要此操作。
边框回归我们会在代码中体现,这里不多做叙述。
2.下面我们开始进入代码模式。
利用脚本生成P_net的训练集,即(12px * 12px)大小的图片,分为 neg、pos、part 三类。
gen_data_pent.py
import sys
import numpy as np
import cv2
import os
import numpy.random as npr
# Side length, in pixels, of the square patches generated for P-Net training.
stdsize = 12
# im_dir = "samples"
# Output folders for the three sample categories, all located under "<stdsize>/".
_prefix = str(stdsize)
pos_save_dir = _prefix + "/positive"
part_save_dir = _prefix + "/part"
neg_save_dir = _prefix + '/negative'
# Root folder; it also receives the pos/neg/part annotation text files.
save_dir = "12"
def IoU(pr_box, boxes):
    """Compute the IoU between one candidate box and a set of ground-truth boxes.

    Coordinates are treated as inclusive pixel indices, so the width of a box
    is ``x2 - x1 + 1`` (and likewise for the height).

    Parameters
    ----------
    pr_box : array-like, at least 4 entries
        Candidate box ``x1, y1, x2, y2``. Only the first four entries are
        read, so a trailing score is ignored.
    boxes : array-like, shape (n, 4) or (4,)
        Ground-truth boxes ``x1, y1, x2, y2``. A single flat box is treated
        as one row.

    Returns
    -------
    numpy.ndarray, shape (n,)
        IoU of ``pr_box`` with every row of ``boxes``.
    """
    # Work in float64 so unsigned/narrow integer inputs cannot wrap around in
    # the subtractions below, and so plain Python lists are accepted too.
    pr_box = np.asarray(pr_box, dtype=np.float64)
    boxes = np.atleast_2d(np.asarray(boxes, dtype=np.float64))

    box_area = (pr_box[2] - pr_box[0] + 1) * (pr_box[3] - pr_box[1] + 1)
    # boxes[:, k] selects column k of every ground-truth row.
    area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)

    # Corners of the intersection rectangle.
    xx1 = np.maximum(pr_box[0], boxes[:, 0])
    yy1 = np.maximum(pr_box[1], boxes[:, 1])
    xx2 = np.minimum(pr_box[2], boxes[:, 2])
    yy2 = np.minimum(pr_box[3], boxes[:, 3])

    # Clamp at zero: disjoint boxes would otherwise yield negative extents.
    w = np.maximum(0, xx2 - xx1 + 1)
    h = np.maximum(0, yy2 - yy1 + 1)
    inter = w * h

    # No debug print here: this function is called for every random crop.
    return inter / (box_area + area - inter)
# Creates the folders used to store the three kinds of samples.
def mkr(dr):
    """Create directory ``dr`` (and any missing parents) if it does not exist.

    Calling it again for an existing directory is a no-op.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also handles nested paths.
    os.makedirs(dr, exist_ok=True)
# Create the output folders for the three sample categories.
mkr(save_dir)
mkr(pos_save_dir)
mkr(part_save_dir)
mkr(neg_save_dir)

# Each label row starts with four corner points (x, y) x 4; according to the
# original note a class value follows them. imgs8.npy holds the images.
annotations = np.load("labels8.npy")
imgs = np.load("imgs8.npy")
num = len(annotations)
print("%d pics in total" % num)

p_idx = 0  # positive
n_idx = 0  # negative
d_idx = 0  # part ("dont care")
idx = 0
box_idx = 0

# Only the first 100 images are used, but never index past the available data.
num_used = min(100, num, len(imgs))
# Upper bound on random tries per image so negative sampling cannot spin
# forever when the face covers (almost) the whole picture.
max_neg_attempts = 50 * 100


def _save_resized(patch, path):
    """Resize ``patch`` to stdsize x stdsize and write it to ``path``."""
    resized = cv2.resize(patch, (stdsize, stdsize), interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(path, resized)


# The three text files list the positive, negative and part samples.
with open(os.path.join(save_dir, 'pos_' + str(stdsize) + '.txt'), 'w') as f1, \
        open(os.path.join(save_dir, 'neg_' + str(stdsize) + '.txt'), 'w') as f2, \
        open(os.path.join(save_dir, 'part_' + str(stdsize) + '.txt'), 'w') as f3:
    for img_i in range(num_used):
        # Axis-aligned bounding box around the four annotated corner points.
        pts = annotations[img_i][0:8].reshape(-1, 2)
        boxes = np.array([[pts[:, 0].min(), pts[:, 1].min(),
                           pts[:, 0].max(), pts[:, 1].max()]])
        img = imgs[img_i]
        idx += 1
        if idx % 100 == 0:
            print(idx, "images done")
        height, width = img.shape[:2]

        # Random crop sizes are drawn from [stdsize, max_crop); randint needs
        # integer bounds and a non-empty range.
        max_crop = min(width, height) // 2
        if max_crop <= stdsize:
            print("skip image %d: too small for %dpx crops" % (img_i, stdsize))
            continue

        # Pure negatives: random crops whose IoU with every gt box is < 0.3.
        neg_num = 0
        attempts = 0
        while neg_num < 50 and attempts < max_neg_attempts:
            attempts += 1
            size = npr.randint(stdsize, max_crop)
            nx = npr.randint(0, width - size)
            ny = npr.randint(0, height - size)
            crop_box = np.array([nx, ny, nx + size, ny + size])
            if np.max(IoU(crop_box, boxes)) < 0.3:
                save_file = os.path.join(neg_save_dir, "%s.jpg" % n_idx)
                f2.write(str(stdsize) + "/negative/%s" % n_idx + ' 0\n')
                _save_resized(img[ny:ny + size, nx:nx + size], save_file)
                n_idx += 1
                neg_num += 1

        for box in boxes:
            # box (x_left, y_top, x_right, y_bottom)
            x1, y1, x2, y2 = box
            w = x2 - x1 + 1
            h = y2 - y1 + 1
            # Ignore faces whose longer side is below 20 px (small ground
            # truth boxes may be inaccurate) and boxes starting off-image.
            if max(w, h) < 20 or x1 < 0 or y1 < 0:
                continue

            # Negatives that overlap the ground truth a little.
            for _ in range(5):
                size = npr.randint(stdsize, max_crop)
                # delta_x and delta_y are offsets of (x1, y1)
                delta_x = npr.randint(int(max(-size, -x1)), int(w))
                delta_y = npr.randint(int(max(-size, -y1)), int(h))
                nx1 = int(max(0, x1 + delta_x))
                ny1 = int(max(0, y1 + delta_y))
                if nx1 + size > width or ny1 + size > height:
                    continue
                crop_box = np.array([nx1, ny1, nx1 + size, ny1 + size])
                if np.max(IoU(crop_box, boxes)) < 0.3:
                    save_file = os.path.join(neg_save_dir, "%s.jpg" % n_idx)
                    f2.write(str(stdsize) + "/negative/%s" % n_idx + ' 0\n')
                    _save_resized(img[ny1:ny1 + size, nx1:nx1 + size], save_file)
                    n_idx += 1

            # Positive examples and part faces.
            box_ = box.reshape(1, -1)
            size_low = max(1, int(min(w, h) * 0.8))
            size_high = int(np.ceil(1.25 * max(w, h)))
            # At least 1 so randint(-jitter, jitter) never gets an empty range.
            jitter_x = max(1, int(w * 0.2))
            jitter_y = max(1, int(h * 0.2))
            for _ in range(20):
                size = npr.randint(size_low, size_high)
                # delta here is the offset of the box center
                delta_x = npr.randint(-jitter_x, jitter_x)
                delta_y = npr.randint(-jitter_y, jitter_y)
                nx1 = max(x1 + w / 2 + delta_x - size / 2, 0)
                ny1 = max(y1 + h / 2 + delta_y - size / 2, 0)
                nx2 = nx1 + size
                ny2 = ny1 + size
                if nx2 > width or ny2 > height:
                    continue
                crop_box = np.array([nx1, ny1, nx2, ny2])

                # Compute the IoU once and reuse it for both thresholds.
                iou = float(np.max(IoU(crop_box, box_)))
                if iou < 0.5:
                    continue

                # Regression targets: gt corners relative to the crop corners,
                # normalised by the crop size.
                offset_x1 = (x1 - nx1) / float(size)
                offset_y1 = (y1 - ny1) / float(size)
                offset_x2 = (x2 - nx2) / float(size)
                offset_y2 = (y2 - ny2) / float(size)
                cropped_im = img[int(ny1):int(ny2), int(nx1):int(nx2)]

                if iou >= 0.7:
                    save_file = os.path.join(pos_save_dir, "%s.jpg" % p_idx)
                    f1.write(str(stdsize) + "/positive/%s" % p_idx
                             + ' 1 %.2f %.2f %.2f %.2f\n' % (offset_x1, offset_y1, offset_x2, offset_y2))
                    _save_resized(cropped_im, save_file)
                    p_idx += 1
                else:
                    save_file = os.path.join(part_save_dir, "%s.jpg" % d_idx)
                    f3.write(str(stdsize) + "/part/%s" % d_idx
                             + ' -1 %.2f %.2f %.2f %.2f\n' % (offset_x1, offset_y1, offset_x2, offset_y2))
                    _save_resized(cropped_im, save_file)
                    d_idx += 1
            box_idx += 1
            print("%s images done, pos: %s part: %s neg: %s" % (idx, p_idx, d_idx, n_idx))
我们要把 三类txt 文本进行合并。以便制作训练集。
import sys
import os
save_dir = "12"
os.makedirs(save_dir, exist_ok=True)


def _relabel(line, extra=""):
    """Append ".jpg" to the first space-separated field of one list line.

    ``extra`` is appended after the remaining fields (used to pad negative
    samples with a dummy box). The result always ends with a newline.
    """
    # rstrip instead of slicing off the last character, so a final line
    # without a trailing newline does not lose its last digit.
    name, _, rest = line.rstrip("\n").partition(" ")
    return name + ".jpg " + rest + extra + "\n"


def _read_lines(prefix):
    """Return the non-blank lines of '<prefix>_<save_dir>.txt' in save_dir."""
    with open(os.path.join(save_dir, '%s_%s.txt' % (prefix, save_dir)), 'r') as fh:
        return [ln for ln in fh if ln.strip()]


pos = [_relabel(ln) for ln in _read_lines('pos')]
# Negatives carry no box, so pad them with four -1 values to keep 6 columns.
neg = [_relabel(ln, " -1 -1 -1 -1") for ln in _read_lines('neg')]
part = [_relabel(ln) for ln in _read_lines('part')]

# Written in the order positive, negative, part; the file is closed (and
# therefore flushed) when the with-block exits.
with open(os.path.join(save_dir, 'label-train%s.txt' % (save_dir)), 'w') as f:
    f.writelines(pos)
    f.writelines(neg)
    f.writelines(part)
生成的 label-train12.txt 文件(文件名为 label-train 加上 save_dir)就是我们需要的训练集了。
3.但是tensorflow 2.0去直接训练 txt 格式,读取速度慢,导致训练速度停滞。为了提高读取速度,我将该txt格式转换成 tfrecord 格式。
gen_tfrecord.py
import os
import random
import sys
import tensorflow as tf
import cv2
from PIL import Image
def _int64_feature(value):
    """Wrap an int, or a list of ints, as an int64 ``tf.train.Feature``."""
    # Anything that is not already a list is treated as a single value.
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
def _float_feature(value):
    """Wrap a float, or a list of floats, as a float ``tf.train.Feature``."""
    # Anything that is not already a list is treated as a single value.
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))
def _bytes_feature(value):
    """Wrap a bytes object, or a list of them, as a bytes ``tf.train.Feature``."""
    # Anything that is not already a list is treated as a single value.
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
def _process_image_withoutcoder(filename):
    """Read an image with OpenCV and return its raw pixel bytes and size.

    Args:
        filename: path of the image file to read.

    Returns:
        A tuple ``(image_data, height, width)`` where ``image_data`` is the
        raw byte content of the decoded array and ``height`` / ``width`` are
        ``shape[0]`` / ``shape[1]`` of that array.

    Raises:
        IOError: ``cv2.imread`` returned ``None`` for ``filename``.
        ValueError: the decoded array is not height x width x 3.
    """
    image = cv2.imread(filename)
    if image is None:
        # imread signals failure by returning None instead of raising.
        raise IOError("cannot read image: %r" % (filename,))
    if image.ndim != 3 or image.shape[2] != 3:
        raise ValueError(
            "expected an HxWx3 image, got shape %r for %r" % (image.shape, filename))
    height, width = image.shape[:2]
    # tobytes() is the supported spelling of the deprecated tostring().
    image_data = image.tobytes()
    return image_data, height, width
def _convert_to_example_simple(image_example, image_buffer):
    """Build a ``tf.train.Example`` for one training sample.

    Parameters
    ----------
    image_example : dict
        Sample description with the keys ``'label'`` and ``'bbox'``;
        ``'bbox'`` is itself a dict holding ``xmin, ymin, xmax, ymax``.
    image_buffer : bytes
        Image payload stored under ``'image/encoded'``. In this file the
        caller passes the raw pixel bytes of the image, not a JPEG stream.

    Returns
    -------
    tf.train.Example
        Example with the features ``image/encoded``, ``image/label`` and
        ``image/roi``.
    """
    class_label = image_example['label']
    bbox = image_example['bbox']
    roi = [bbox[key] for key in ('xmin', 'ymin', 'xmax', 'ymax')]
    # Landmark features are not written; only label and box are stored.
    feature = {
        'image/encoded': _bytes_feature(image_buffer),
        'image/label': _int64_feature(class_label),
        'image/roi': _float_feature(roi),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))
# Loads one image plus its annotation and appends it to the TFRecord file.
def _add_to_tfrecord(filename, image_example, tfrecord_writer):
    """Serialize one sample and write it with ``tfrecord_writer``.

    Args:
        filename: path of the image file to load.
        image_example: dict describing the sample (label and bbox).
        tfrecord_writer: writer object whose ``write`` receives the
            serialized Example.
    """
    # The image size is returned as well but is not stored in the record.
    image_data, _height, _width = _process_image_withoutcoder(filename)
    serialized = _convert_to_example_simple(image_example, image_data).SerializeToString()
    tfrecord_writer.write(serialized)
def _get_output_filename(output_dir,net):
# 定义一下输出的文件名
# return '%s/%s_%s_%s.tfrecord' % (output_dir, name, net, st)
# st = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# time.strftime() 函数接收以时间元组,并返回以可读字符串表示的当地时间,格式由参数format决定:time.strftime(format[, t]),用来输出当前时间
# 返回的是'../../DATA/imglists/PNet/train_PNet_landmark.tfrecord'
return '%s/train_%s_landmark.tfrecord' % (output_dir,net)
def run(dataset_dir, net, output_dir, shuffling=False):
    """Convert the label list of ``dataset_dir`` into a TFRecord file.

    Args:
        dataset_dir: directory handed to ``get_dataset`` to load the samples.
        net: network name used in the output file name (e.g. 'PNet').
        output_dir: directory the TFRecord file is written to.
        shuffling: when true, the samples are shuffled before writing and
            '_shuffle' is appended to the output file name.

    Returns:
        None. Nothing is written when the output file already exists.
    """
    tf_filename = _get_output_filename(output_dir, net)
    # Decide on the final name BEFORE the existence check, so the check looks
    # at the file that would actually be written.
    if shuffling:
        tf_filename = tf_filename + '_shuffle'
    if tf.io.gfile.exists(tf_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    dataset = get_dataset(dataset_dir)
    if shuffling:
        random.shuffle(dataset)

    total = len(dataset)
    with tf.io.TFRecordWriter(tf_filename) as tfrecord_writer:
        for i, image_example in enumerate(dataset):
            # '\r' rewrites the same console line to show progress.
            sys.stdout.write('\r>> %d/%d images has been converted' % (i + 1, total))
            sys.stdout.flush()
            _add_to_tfrecord(image_example['filename'], image_example, tfrecord_writer)
    print('\nFinished converting the MTCNN dataset!')
def get_dataset(dir):
    """Parse ``<dir>/label-train<dir>.txt`` into a list of sample dicts.

    Every non-blank line has the form ``filename label [x1 y1 x2 y2]`` with
    whitespace-separated fields.

    Args:
        dir: directory name; it is also used as the suffix of the list file
            name (the parameter name is kept for backward compatibility even
            though it shadows the builtin).

    Returns:
        A list of dicts with the keys ``'filename'`` (str), ``'label'`` (int)
        and ``'bbox'`` (dict with ``xmin, ymin, xmax, ymax``). The box values
        are floats when the line has exactly six fields and 0 otherwise.
    """
    item = 'label-train%s.txt' % (dir)
    list_path = os.path.join(dir, item)
    dataset = []
    # The with-block closes the file even if a line fails to parse.
    with open(list_path, 'r') as imagelist:
        for line in imagelist:
            # split() without arguments tolerates repeated spaces and tabs.
            info = line.split()
            if not info:
                continue  # skip blank lines
            bbox = {'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}
            if len(info) == 6:
                bbox['xmin'] = float(info[2])
                bbox['ymin'] = float(info[3])
                bbox['xmax'] = float(info[4])
                bbox['ymax'] = float(info[5])
            dataset.append({
                'filename': info[0],
                'label': int(info[1]),
                'bbox': bbox,
            })
    return dataset
if __name__ == '__main__':
    # The label list is read from the "12" folder and the TFRecord is written
    # back into the same folder; samples are shuffled first.
    run('12', 'PNet', '12', shuffling=True)
4.我们对训练集进行了编码,那么我在读取该文件时就需要解码。我们编写解码函数。
read_tfrecord.py
import tensorflow as tf
import numpy as np
def image_color_distort(inputs):
    """Apply random contrast, brightness, hue and saturation changes.

    The four ``tf.image.random_*`` operations are applied in that order and
    the distorted tensor is returned.
    """
    steps = (
        lambda t: tf.image.random_contrast(t, lower=0.5, upper=1.5),
        lambda t: tf.image.random_brightness(t, max_delta=0.2),
        lambda t: tf.image.random_hue(t, max_delta=0.2),
        lambda t: tf.image.random_saturation(t, lower=0.5, upper=1.5),
    )
    distorted = inputs
    for step in steps:
        distorted = step(distorted)
    return distorted
def red_tf(imgs, net_size, augment=True):
    """Read a TFRecord file and decode every sample into Python lists.

    Args:
        imgs: TFRecord file name(s) passed to ``tf.data.TFRecordDataset``.
        net_size: side length of the stored square images; each record is
            reshaped to ``[net_size, net_size, 3]``.
        augment: when true (default) a random colour distortion is applied
            to every image; pass ``False`` to get undistorted images.

    Returns:
        ``(image_batch, label_batch, bbox_batch)``: three lists with one
        entry per record - the image as float32 after ``(x - 127.5) / 128``,
        the label as float32 and the 4-value roi as float32.
    """
    raw_image_dataset = tf.data.TFRecordDataset(imgs).shuffle(1000)
    image_feature_description = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/label': tf.io.FixedLenFeature([], tf.int64),
        'image/roi': tf.io.FixedLenFeature([4], tf.float32),
    }

    def _parse_image_function(example_proto):
        # Parse the input tf.Example proto using the dictionary above.
        return tf.io.parse_single_example(example_proto, image_feature_description)

    parsed_image_dataset = raw_image_dataset.map(_parse_image_function)

    image_batch = []
    label_batch = []
    bbox_batch = []
    for image_features in parsed_image_dataset:
        image_raw = tf.io.decode_raw(image_features['image/encoded'], tf.uint8)
        pixels = tf.cast(tf.reshape(image_raw, [net_size, net_size, 3]), tf.float32)
        if augment:
            # The tf.image hue/saturation ops are documented for float images
            # in [0, 1], so distort in that range (not on the already
            # normalised values), clip, then go back to the 0-255 scale.
            unit = image_color_distort(pixels / 255.0)
            pixels = tf.clip_by_value(unit, 0.0, 1.0) * 255.0
        # Map pixel values from [0, 255] to roughly [-1, 1].
        image = (pixels - 127.5) / 128
        image_batch.append(image)
        label_batch.append(tf.cast(image_features['image/label'], tf.float32))
        bbox_batch.append(tf.cast(image_features['image/roi'], tf.float32))
    return image_batch, label_batch, bbox_batch