I have recently been working on a face-related task, so I spent some time playing with MTCNN and wrote part of the work up as a blog post to share what I learned. The content is fairly basic, but it gets the job done. Please forgive any shortcomings. Thanks!
Contents
MTCNN (TensorFlow) face detection, step by step:
- Setting up the environment
- Preparing the datasets
- Training the MTCNN detection model
- Testing the trained detector and modifying the code for video detection
- Video face detection demo
Setting up the environment
Hardware: Intel i7-8700, 2x RTX 2070, 32 GB RAM.
Software: NVIDIA driver 430 + CUDA 9.0 + cuDNN 7.3 + Anaconda3 + tensorflow-gpu==1.12. (For the installation steps, see my earlier posts on environment setup; the procedure is the same, just substitute the version numbers above.)
[Notice: NVIDIA 20-series cards seem to officially support only CUDA 10. To get CUDA 9.0 and tensorflow-gpu 1.12 running on this machine, download and install the four CUDA 9.0 patches (Patch 1 through Patch 4) from the NVIDIA archive: https://developer.nvidia.com/cuda-90-download-archive]
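Before downloading any data, it is worth confirming that TensorFlow can actually see the GPUs. A minimal sanity check, assuming the tensorflow-gpu 1.12 environment above is active:

import tensorflow as tf
from tensorflow.python.client import device_lib

# expect 1.12.x and True if CUDA 9.0 / cuDNN 7.3 are wired up correctly
print(tf.__version__)
print(tf.test.is_gpu_available())
# list the devices TensorFlow can use; both GPUs should show up here
print([d.name for d in device_lib.list_local_devices()])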
Preparing the datasets
Clone the source code into your local project directory:
git clone https://github.com/BobLiu20/mtcnn_tf
The experiments use two datasets: the WIDER FACE detection set (WIDER_train.zip) and the facial landmark set (train.zip).
Download links:
http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/
http://mmlab.ie.cuhk.edu.hk/archive/CNN_FacePoint.htm
Once downloaded, unzip both archives into the project's dataset directory.
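Before kicking off the conversion scripts, a quick check that everything landed in the right place can save a failed run. A minimal sketch; the folder names WIDER_train and train are assumed to be what the two archives unzip to:

import os

# assumed layout: dataset/WIDER_train and dataset/train after unzipping
for name in ['WIDER_train', 'train']:
    path = os.path.join('dataset', name)
    print(path, 'found' if os.path.isdir(path) else 'MISSING')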
Training the face detection model
Following the author's instructions, the data-format conversion and model training only require running two scripts.
From the project directory, run:
./clearAll.sh
./runAll.sh (this script handles both the data-format conversion and the training)
Once training is underway, expect it to take roughly two days to finish.
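When runAll.sh finishes, each stage's checkpoints should sit under tmp/model/ (the same path the test script below loads from). A quick hedged check that all three networks produced a model:

import os

# the test script below expects checkpoints named like pnet-<epoch>.index
for stage in ['pnet', 'rnet', 'onet']:
    d = os.path.join('tmp/model', stage)
    ckpts = [f for f in os.listdir(d) if f.endswith('.index')] if os.path.isdir(d) else []
    print(stage, ckpts if ckpts else 'no checkpoint found')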
Testing the model and modifying the code for video detection
With the trained model in hand, we can test it. As you would expect, a demo trained on these public datasets performs quite well.
- Copy the photos you want to test into the project's testing/images directory.
- From the testing directory, run the test script:
python test_images.py --stage=onet
The results are saved in the result_onet directory.
Sample detections are shown below (test photos found via Baidu image search):
Real-time face detection in video with MTCNN
1. Modify the code in detection/MtcnnDetector.py:
import cv2
import time
import numpy as np
import sys
import os
rootPath = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
sys.path.insert(0, rootPath)
from detection.nms import py_nms
from training.mtcnn_config import config
class MtcnnDetector(object):
def __init__(self,
detectors,
min_face_size=24,
stride=2,
threshold=[0.6, 0.7, 0.7],
scale_factor=0.79):
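# `threshold` holds the face-score cutoffs for PNet, RNet and ONet (in that
# order); `scale_factor` controls how quickly the image pyramid in
# detect_pnet shrinks the input between passes.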
self.pnet_detector = detectors[0]
self.rnet_detector = detectors[1]
self.onet_detector = detectors[2]
self.min_face_size = min_face_size
self.stride = stride
self.thresh = threshold
self.scale_factor = scale_factor
def convert_to_square(self, bbox):
"""
convert bbox to square
Parameters:
----------
bbox: numpy array , shape n x 5
input bbox
Returns:
-------
square bbox
"""
square_bbox = bbox.copy()
h = bbox[:, 3] - bbox[:, 1] + 1
w = bbox[:, 2] - bbox[:, 0] + 1
max_side = np.maximum(h, w)
square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - max_side * 0.5
square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - max_side * 0.5
square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1
square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1
return square_bbox
def calibrate_box(self, bbox, reg):
"""
calibrate bboxes
Parameters:
----------
bbox: numpy array, shape n x 5
input bboxes
reg: numpy array, shape n x 4
bboxes adjustment
Returns:
-------
bboxes after refinement
"""
bbox_c = bbox.copy()
w = bbox[:, 2] - bbox[:, 0] + 1
w = np.expand_dims(w, 1)
h = bbox[:, 3] - bbox[:, 1] + 1
h = np.expand_dims(h, 1)
reg_m = np.hstack([w, h, w, h])
aug = reg_m * reg
bbox_c[:, 0:4] = bbox_c[:, 0:4] + aug
return bbox_c
def generate_bbox(self, cls_map, reg, scale, threshold):
"""
generate bbox from feature cls_map
Parameters:
----------
cls_map: numpy array , n x m
detect score for each position
reg: numpy array , n x m x 4
bbox
scale: float number
scale of this detection
threshold: float number
detect threshold
Returns:
-------
bbox array
"""
cellsize = 12
t_index = np.where(cls_map > threshold)
# find nothing
if t_index[0].size == 0:
return np.array([])
#offset
dx1, dy1, dx2, dy2 = [reg[t_index[0], t_index[1], i] for i in range(4)]
reg = np.array([dx1, dy1, dx2, dy2])
score = cls_map[t_index[0], t_index[1]]
boundingbox = np.vstack([np.round((self.stride * t_index[1]) / scale),
np.round((self.stride * t_index[0]) / scale),
np.round((self.stride * t_index[1] + cellsize) / scale),
np.round((self.stride * t_index[0] + cellsize) / scale),
score,
reg])
return boundingbox.T
def processed_image(self, img, scale):
height, width, channels = img.shape
new_height = int(height * scale) # resized new height
new_width = int(width * scale) # resized new width
new_dim = (new_width, new_height)
img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR) # resized image
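# normalize pixel values to roughly [-1, 1]; this must match the
# preprocessing used when the networks were trained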
img_resized = (img_resized - 127.5) / 128
return img_resized
def pad(self, bboxes, w, h):
"""
pad the bboxes and restrict their size
Parameters:
----------
bboxes: numpy array, n x 5
input bboxes
w: float number
width of the input image
h: float number
height of the input image
Returns :
------
dy, dx : numpy array, n x 1
start point of the bbox in target image
edy, edx : numpy array, n x 1
end point of the bbox in target image
y, x : numpy array, n x 1
start point of the bbox in original image
ey, ex : numpy array, n x 1
end point of the bbox in original image
tmph, tmpw: numpy array, n x 1
height and width of the bbox
"""
tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1
num_box = bboxes.shape[0]
dx, dy = np.zeros((num_box,)), np.zeros((num_box,))
edx, edy = tmpw.copy() - 1, tmph.copy() - 1
x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
tmp_index = np.where(ex > w - 1)
edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index]
ex[tmp_index] = w - 1
tmp_index = np.where(ey > h - 1)
edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index]
ey[tmp_index] = h - 1
tmp_index = np.where(x < 0)
dx[tmp_index] = 0 - x[tmp_index]
x[tmp_index] = 0
tmp_index = np.where(y < 0)
dy[tmp_index] = 0 - y[tmp_index]
y[tmp_index] = 0
return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
return_list = [item.astype(np.int32) for item in return_list]
return return_list
def detect_pnet(self, im):
"""Get face candidates through pnet
Parameters:
----------
im: numpy array
input image array
Returns:
-------
boxes: numpy array
detected boxes before calibration
boxes_c: numpy array
boxes after calibration
"""
h, w, c = im.shape
net_size = 12
current_scale = float(net_size) / self.min_face_size # find initial scale
im_resized = self.processed_image(im, current_scale)
current_height, current_width, _ = im_resized.shape
# for fcn
all_boxes = list()
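# Image pyramid: run the fully convolutional PNet on progressively smaller
# copies of the image (shrinking by scale_factor each pass) until the image
# is smaller than the 12x12 network input.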
while min(current_height, current_width) > net_size:
#return the result predicted by pnet
#cls_cls_map : H*w*2
#reg: H*w*4
cls_cls_map, reg = self.pnet_detector.predict(im_resized)
#boxes: num*9(x1,y1,x2,y2,score,x1_offset,y1_offset,x2_offset,y2_offset)
boxes = self.generate_bbox(cls_cls_map[:, :,1], reg, current_scale, self.thresh[0])
current_scale *= self.scale_factor
im_resized = self.processed_image(im, current_scale)
current_height, current_width, _ = im_resized.shape
if boxes.size == 0:
continue
keep = py_nms(boxes[:, :5], 0.5, 'Union')
boxes = boxes[keep]
all_boxes.append(boxes)
if len(all_boxes) == 0:
return None, None, None
all_boxes = np.vstack(all_boxes)
# merge the detection from first stage
keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union')
all_boxes = all_boxes[keep]
boxes = all_boxes[:, :5]
bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1
bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1
# refine the boxes
boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw,
all_boxes[:, 1] + all_boxes[:, 6] * bbh,
all_boxes[:, 2] + all_boxes[:, 7] * bbw,
all_boxes[:, 3] + all_boxes[:, 8] * bbh,
all_boxes[:, 4]])
boxes_c = boxes_c.T
return boxes, boxes_c, None
def detect_rnet(self, im, dets):
"""Get face candidates using rnet
Parameters:
----------
im: numpy array
input image array
dets: numpy array
detection results of pnet
Returns:
-------
boxes: numpy array
detected boxes before calibration
boxes_c: numpy array
boxes after calibration
"""
h, w, c = im.shape
dets = self.convert_to_square(dets)
dets[:, 0:4] = np.round(dets[:, 0:4])
[dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
num_boxes = dets.shape[0]
cropped_ims = np.zeros((num_boxes, 24, 24, 3), dtype=np.float32)
for i in range(num_boxes):
tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
cropped_ims[i, :, :, :] = (cv2.resize(tmp, (24, 24))-127.5) / 128
#cls_scores : num_data*2
#reg: num_data*4
#landmark: num_data*10
cls_scores, reg, _ = self.rnet_detector.predict(cropped_ims)
cls_scores = cls_scores[:,1]
keep_inds = np.where(cls_scores > self.thresh[1])[0]
if len(keep_inds) > 0:
boxes = dets[keep_inds]
boxes[:, 4] = cls_scores[keep_inds]
reg = reg[keep_inds]
else:
return None, None, None
keep = py_nms(boxes, 0.6)
boxes = boxes[keep]
boxes_c = self.calibrate_box(boxes, reg[keep])
return boxes, boxes_c, None
def detect_onet(self, im, dets):
"""Get face candidates using onet
Parameters:
----------
im: numpy array
input image array
dets: numpy array
detection results of rnet
Returns:
-------
boxes: numpy array
detected boxes before calibration
boxes_c: numpy array
boxes after calibration
"""
h, w, c = im.shape
dets = self.convert_to_square(dets)
dets[:, 0:4] = np.round(dets[:, 0:4])
[dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
num_boxes = dets.shape[0]
cropped_ims = np.zeros((num_boxes, 48, 48, 3), dtype=np.float32)
for i in range(num_boxes):
tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
cropped_ims[i, :, :, :] = (cv2.resize(tmp, (48, 48))-127.5) / 128
cls_scores, reg,landmark = self.onet_detector.predict(cropped_ims)
#prob belongs to face
cls_scores = cls_scores[:,1]
keep_inds = np.where(cls_scores > self.thresh[2])[0]
if len(keep_inds) > 0:
#pickout filtered box
boxes = dets[keep_inds]
boxes[:, 4] = cls_scores[keep_inds]
reg = reg[keep_inds]
landmark = landmark[keep_inds]
else:
return None, None, None
#width
w = boxes[:,2] - boxes[:,0] + 1
#height
h = boxes[:,3] - boxes[:,1] + 1
landmark[:,0::2] = (np.tile(w,(5,1)) * landmark[:,0::2].T + np.tile(boxes[:,0],(5,1)) - 1).T
landmark[:,1::2] = (np.tile(h,(5,1)) * landmark[:,1::2].T + np.tile(boxes[:,1],(5,1)) - 1).T
boxes_c = self.calibrate_box(boxes, reg)
boxes = boxes[py_nms(boxes, 0.6, "Minimum")]
keep = py_nms(boxes_c, 0.6, "Minimum")
boxes_c = boxes_c[keep]
landmark = landmark[keep]
return boxes, boxes_c,landmark
# the code below is used for face detection on video streams
def detect_video(self, img):
"""Detect face over image
"""
boxes = None
t = time.time()
# pnet
t1 = 0
if self.pnet_detector:
boxes, boxes_c, _ = self.detect_pnet(img)
if boxes_c is None:
return np.array([]), np.array([])
t1 = time.time() - t
t = time.time()
# rnet
t2 = 0
if self.rnet_detector:
boxes, boxes_c, _ = self.detect_rnet(img, boxes_c)
if boxes_c is None:
return np.array([]), np.array([])
t2 = time.time() - t
t = time.time()
# onet
t3 = 0
if self.onet_detector:
boxes, boxes_c, landmark = self.detect_onet(img, boxes_c)
if boxes_c is None:
return np.array([]), np.array([])
t3 = time.time() - t
t = time.time()
# print(
# "time cost " + '{:.3f}'.format(t1 + t2 + t3) + ' pnet {:.3f} rnet {:.3f} onet {:.3f}'.format(t1, t2,
# t3))
return boxes_c, landmark
def detect_face(self, test_data):
all_boxes = [] #save each image's bboxes
landmarks = []
batch_idx = 0
for databatch in test_data:
# print info
printStr = "\rDone images: {}\n".format(batch_idx)
sys.stdout.write(printStr)
sys.stdout.flush()
batch_idx += 1
im = databatch
# pnet
if self.pnet_detector:
#ignore landmark
boxes, boxes_c, landmark = self.detect_pnet(im)
if boxes_c is None:
all_boxes.append(np.array([]))
landmarks.append(np.array([]))
continue
# rnet
if self.rnet_detector:
#ignore landmark
boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
if boxes_c is None:
all_boxes.append(np.array([]))
landmarks.append(np.array([]))
continue
# onet
if self.onet_detector:
boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
if boxes_c is None:
all_boxes.append(np.array([]))
landmarks.append(np.array([]))
continue
all_boxes.append(boxes_c)
landmarks.append(landmark)
return all_boxes,landmarks
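Before wiring the detector into a video loop, it can be smoke-tested on a single frame. A minimal sketch, assuming `detectors` is the [PNet, RNet, ONet] list built the same way as in the test script below, and with 'face.jpg' as a hypothetical placeholder path for any local test image:

import cv2
from detection.MtcnnDetector import MtcnnDetector

# `detectors` is assumed to be built as in testing/test_images.py below
detector = MtcnnDetector(detectors=detectors, min_face_size=24,
                         threshold=[0.6, 0.7, 0.7])
img = cv2.imread('face.jpg')
boxes_c, landmarks = detector.detect_video(img)
for b in boxes_c:
    # each row is [x1, y1, x2, y2, score]; draw the calibrated box
    cv2.rectangle(img, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (0, 0, 255))
cv2.imwrite('face_out.jpg', img)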
2. Modify the test code in testing/test_images.py.
(This code is only for reference; my coding skills are limited, it could certainly be optimized further, and a few small bugs remain.)
#coding:utf-8
#author: AIBC-hxy
'''
Run MTCNN face detection on a video stream.
'''
import tensorflow as tf
import numpy as np
import os
import sys
import cv2
rootPath = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
sys.path.insert(0, rootPath)
from training.mtcnn_model import P_Net, R_Net, O_Net
from tools.loader import TestLoader
from detection.MtcnnDetector import MtcnnDetector
from detection.detector import Detector
from detection.fcn_detector import FcnDetector
import time
import gc
from multiprocessing import Process, Manager
def net(stage):
detectors = [None, None, None]
if stage in ['pnet', 'rnet', 'onet']:
modelPath = os.path.join(rootPath, 'tmp/model/pnet/')
a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('pnet-') and b.endswith('.index')]
maxEpoch = max(map(int, a)) # auto match a max epoch model
modelPath = os.path.join(modelPath, "pnet-%d"%(maxEpoch))
print("Use PNet model: %s"%(modelPath))
detectors[0] = FcnDetector(P_Net,modelPath)
if stage in ['rnet', 'onet']:
modelPath = os.path.join(rootPath, 'tmp/model/rnet/')
a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('rnet-') and b.endswith('.index')]
maxEpoch = max(map(int, a))
modelPath = os.path.join(modelPath, "rnet-%d"%(maxEpoch))
print("Use RNet model: %s"%(modelPath))
detectors[1] = Detector(R_Net, 24, 1, modelPath)
if stage in ['onet']:
modelPath = os.path.join(rootPath, 'tmp/model/onet/')
a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('onet-') and b.endswith('.index')]
maxEpoch = max(map(int, a))
modelPath = os.path.join(modelPath, "onet-%d"%(maxEpoch))
print("Use ONet model: %s"%(modelPath))
detectors[2] = Detector(O_Net, 48, 1, modelPath)
return detectors
'''
Python multiprocessing: one process grabs frames from the camera and pushes
them into a shared buffer; the other pops frames and runs the detector.
'''
def receive(stack):
top = 100
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
while True:
ret, frame = cap.read()
if ret:
stack.append(frame)
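# once the buffer holds `top` frames, drop them all so the detector always
# works on recent frames rather than a growing backlog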
if len(stack) >= top:
del stack[:]
gc.collect()
def release(stack):
print('Begin to get frame......')
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
detectors = net('onet')
mtcnnDetector = MtcnnDetector(detectors=detectors, min_face_size = 24, threshold=[0.9, 0.6, 0.7])
while True:
if len(stack) > 0:
image = stack.pop()
image = cv2.resize(image, (int(image.shape[1]/3), int(image.shape[0]/3)))
image = np.array(image)
boxes_c, _ = mtcnnDetector.detect_video(image)
for bbox in boxes_c:
x1 = int(bbox[0])
y1 = int(bbox[1])
x2 = int(bbox[2])
y2 = int(bbox[3])
cv2.rectangle(image, (x1, y1), (x2,y2), (0,0,255))
print('detected face: ({},{}), ({},{})'.format(x1, y1, x2, y2))
cv2.imshow("Detected", image)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cv2.destroyAllWindows()
if __name__=='__main__':
t = Manager().list()
t1 = Process(target=receive, args=(t,))
t2 = Process(target=release, args=(t,))
t1.start()
t2.start()
t1.join()
t2.terminate()
Once the code is modified, just run it:
python test_images.py
A screenshot of the video detection is shown below (I tweaked how the detection boxes are drawn; please also ignore my big face!):