Preface
These are my object detection study notes from September 2 on text detection in natural scenes, divided into three chapters:
- the ICDAR dataset;
- the EAST model;
- the MTCNN model.
一、ICDAR Dataset
1、Data annotation format
- Left: the original image;
- Middle: the image with the annotations drawn on it;
- Right: the contents of the ground-truth file: the four corner coordinates of each text region in clockwise order, followed by the transcribed word (a minimal parsing sketch follows this list);
- ###: marks a "don't care" region whose text is ignored during training.
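As referenced above, here is a minimal parsing sketch for such a ground-truth file. It assumes the ICDAR 2015 layout of one `x1,y1,x2,y2,x3,y3,x4,y4,transcription` line per text region; adjust it if the files at hand differ:

```python
# Minimal sketch, assuming ICDAR-2015-style ground truth:
# one line per region, 8 clockwise corner coordinates followed by the transcription,
# with "###" marking a don't-care region.
def load_icdar_gt(gt_path):
    boxes, texts, ignored = [], [], []
    with open(gt_path, encoding='utf-8-sig') as f:    # these files often carry a UTF-8 BOM
        for line in f:
            parts = line.strip().split(',')
            if len(parts) < 9:
                continue
            coords = list(map(int, parts[:8]))        # 4 corner points, clockwise
            text = ','.join(parts[8:])                # the transcription may itself contain commas
            boxes.append([(coords[i], coords[i + 1]) for i in range(0, 8, 2)])
            texts.append(text)
            ignored.append(text == '###')             # don't-care flag
    return boxes, texts, ignored
```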
二、EAST Model
- Uses an FCN (fully convolutional network) to produce multi-scale, fused feature maps;
- Supports rotated rectangular boxes (RBOX); a decoding sketch follows this list.
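The RBOX geometry in the EAST paper predicts, for every pixel inside a text region, its four distances to the box edges plus a rotation angle. A hedged decoding sketch (the exact tensor layout and sign convention in a given implementation may differ):

```python
import numpy as np

def rbox_to_quad(x, y, distances, theta):
    """Sketch: decode one RBOX prediction at pixel (x, y).
    distances = (top, right, bottom, left) from the pixel to the box edges,
    theta = rotation angle of the text box. Returns 4 corners, clockwise."""
    top, right, bottom, left = distances
    # Corners of the box relative to the pixel, before rotation.
    corners = np.array([[-left, -top],
                        [right, -top],
                        [right, bottom],
                        [-left, bottom]], dtype=np.float32)
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]], dtype=np.float32)
    # Rotate around the pixel and translate back into image coordinates.
    return corners @ rot.T + np.array([x, y], dtype=np.float32)
```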
Network structure:
1、Parameter changes
- multigpu_train.py:
tf.app.flags.DEFINE_integer('input_size', 512, '')
tf.app.flags.DEFINE_integer('batch_size_per_gpu', 2, '')
tf.app.flags.DEFINE_integer('num_readers', 1, '')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, '')
tf.app.flags.DEFINE_integer('max_steps', 100000, '')
tf.app.flags.DEFINE_float('moving_average_decay', 0.997, '')
tf.app.flags.DEFINE_string('gpu_list', '0', '')
tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/east_resnet_v1_50_rbox/', '')
tf.app.flags.DEFINE_boolean('restore', False, 'whether to restore from checkpoint')
tf.app.flags.DEFINE_integer('save_checkpoint_steps', 1000, '')
tf.app.flags.DEFINE_integer('save_summary_steps', 100, '')
tf.app.flags.DEFINE_string('pretrained_model_path', None, '')
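These `tf.app.flags` definitions are parsed from the command line, so the values can also be overridden without editing the file. A small self-contained sketch of how the flags are consumed (TensorFlow 1.x; the script name and the batch-size arithmetic are just an illustration):

```python
# Minimal sketch of tf.app.flags usage (TensorFlow 1.x). Run, e.g.:
#   python flags_demo.py --batch_size_per_gpu=4 --gpu_list=0,1
import tensorflow as tf

tf.app.flags.DEFINE_integer('batch_size_per_gpu', 2, '')
tf.app.flags.DEFINE_string('gpu_list', '0', '')
FLAGS = tf.app.flags.FLAGS

def main(argv=None):
    gpus = FLAGS.gpu_list.split(',')
    # The effective (global) batch size is batch_size_per_gpu times the number of GPUs.
    print('GPUs:', gpus, 'global batch size:', FLAGS.batch_size_per_gpu * len(gpus))

if __name__ == '__main__':
    tf.app.run()
```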
2、Model testing
- eval.py:
tf.app.flags.DEFINE_string('test_data_path', './test/', '')
tf.app.flags.DEFINE_string('gpu_list', '0', '')
tf.app.flags.DEFINE_string('checkpoint_path', './model/', '')
tf.app.flags.DEFINE_string('output_dir', './out/', '')
tf.app.flags.DEFINE_bool('no_write_images', False, 'do not write images')
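After running eval.py, the detections land in `output_dir`. Assuming the result files use the same one-quadrilateral-per-line, comma-separated format as the ICDAR ground truth (an assumption worth checking against the repository), they can be visualized like this:

```python
# Sketch for visualizing results written to output_dir (assumed format: one text file
# per image, 8 comma-separated quad coordinates per line).
import cv2
import numpy as np

def draw_quads(image_path, result_txt, out_path):
    image = cv2.imread(image_path)
    with open(result_txt) as f:
        for line in f:
            vals = line.strip().split(',')[:8]
            if len(vals) < 8:
                continue
            quad = np.array(list(map(int, vals)), dtype=np.int32).reshape(-1, 2)
            cv2.polylines(image, [quad], isClosed=True, color=(0, 0, 255), thickness=2)
    cv2.imwrite(out_path, image)
```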
三、MTCNN
- Stage 1: P-Net (proposal network);
- Stage 2: R-Net (refine network);
- Stage 3: O-Net (output network). Each stage prunes its candidate boxes with non-maximum suppression (see the sketch after this list).
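Each stage of the cascade ends with non-maximum suppression over the candidate boxes, as mentioned above. A small self-contained NMS sketch, with boxes given as `[x1, y1, x2, y2, score]` rows (the same layout the test script below indexes with `bbox[4]` for the score):

```python
import numpy as np

def nms(boxes, iou_threshold=0.7):
    """Greedy NMS over boxes given as rows of [x1, y1, x2, y2, score]."""
    if len(boxes) == 0:
        return boxes
    boxes = np.asarray(boxes, dtype=np.float32)
    order = boxes[:, 4].argsort()[::-1]          # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the best remaining box with all the others still in play.
        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = (boxes[order[1:], 2] - boxes[order[1:], 0]) * (boxes[order[1:], 3] - boxes[order[1:], 1])
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= iou_threshold]   # drop boxes that overlap too much
    return boxes[keep]
```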
1、Packing the LFPW data
The crops are split by their IoU with the ground-truth face boxes (a labeling sketch follows the path settings below):
- Negative: crops containing no face;
- Positive: crops containing a face;
- Part faces: crops partially overlapping a face.
- gen_12net_data.py:
anno_file = "wider_face_train.txt"
im_dir = "../../DATA/WIDER_train/images"
pos_save_dir = "../../DATA/12/positive"
part_save_dir = "../../DATA/12/part"
neg_save_dir = '../../DATA/12/negative'
save_dir = "../../DATA/12"
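gen_12net_data.py decides which of the folders above a random crop goes to by its IoU with the ground-truth face boxes. A hedged sketch of that labeling step; the thresholds below (< 0.3 negative, ≥ 0.65 positive, 0.4–0.65 part) are the values commonly used with MTCNN and may differ from the exact ones in the script:

```python
import numpy as np

def iou(crop, gt_boxes):
    """IoU between one crop [x1, y1, x2, y2] and an array of ground-truth boxes."""
    crop = np.asarray(crop, dtype=np.float32)
    gt_boxes = np.asarray(gt_boxes, dtype=np.float32)
    xx1 = np.maximum(crop[0], gt_boxes[:, 0])
    yy1 = np.maximum(crop[1], gt_boxes[:, 1])
    xx2 = np.minimum(crop[2], gt_boxes[:, 2])
    yy2 = np.minimum(crop[3], gt_boxes[:, 3])
    inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
    area_c = (crop[2] - crop[0]) * (crop[3] - crop[1])
    area_g = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
    return inter / (area_c + area_g - inter)

def label_crop(crop, gt_boxes, neg_thr=0.3, part_thr=0.4, pos_thr=0.65):
    """Assign negative / part / positive based on the best IoU with any ground truth."""
    if len(gt_boxes) == 0:
        return 'negative'
    best = iou(crop, gt_boxes).max()
    if best < neg_thr:
        return 'negative'
    if best >= pos_thr:
        return 'positive'
    if best >= part_thr:
        return 'part'
    return 'discard'   # crops in the 0.3-0.4 band are typically not used
```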
- gen_landmark_aug_12.py:
import os
from os.path import exists

dstdir = "../../DATA/12/train_PNet_landmark_aug"
OUTPUT = '../../DATA/12'
data_path = '../../DATA'
# Create the output directories if they do not exist yet.
if not exists(OUTPUT):
    os.mkdir(OUTPUT)
if not exists(dstdir):
    os.mkdir(dstdir)
assert (exists(dstdir) and exists(OUTPUT))
# train data
net = "PNet"
# the file contains the names of all the landmark training data
train_txt = "trainImageList.txt"
imgs, landmarks = GenerateData(train_txt, data_path, net, argument=True)
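The `argument=True` switch enables landmark augmentation. One typical augmentation is a horizontal flip, sketched below under the assumption of 5 landmarks normalized to [0, 1] in the order left eye, right eye, nose, left mouth corner, right mouth corner (the function name is a placeholder, not the script's API):

```python
import numpy as np

def flip_landmarks(face_img, landmarks):
    """Horizontally mirror a face crop and its 5 normalized landmarks.
    landmarks: flat numpy array of 10 values in the assumed order
    left eye, right eye, nose, left mouth corner, right mouth corner."""
    flipped = np.fliplr(face_img).copy()
    lm = np.asarray(landmarks, dtype=np.float32).reshape(5, 2).copy()
    lm[:, 0] = 1.0 - lm[:, 0]        # mirror the x coordinate (normalized to [0, 1])
    lm = lm[[1, 0, 2, 4, 3]]         # swap left/right eye and left/right mouth corner
    return flipped, lm.reshape(-1)
```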
2、Model training
- P-Net:
#data path
base_dir = '../../DATA/imglists/PNet'
model_name = 'MTCNN'
#model_path = '../data/%s_model/PNet/PNet' % model_name
#with landmark
model_path = '../data/%s_model/PNet_landmark/PNet' % model_name
prefix = model_path
end_epoch = 30
display = 100
lr = 0.001
train_PNet(base_dir, prefix, end_epoch, display, lr)
- R-Net:
base_dir = '../../DATA/imglists_noLM/RNet'
model_name = 'MTCNN'
model_path = '../data/%s_model/RNet_No_Landmark/RNet' % model_name
prefix = model_path
end_epoch = 22
display = 100
lr = 0.001
train_RNet(base_dir, prefix, end_epoch, display, lr)
- O-Net:
base_dir = '../../DATA/imglists/ONet'
model_name = 'MTCNN'
model_path = '../data/%s_model/ONet_landmark/ONet' % model_name
prefix = model_path
end_epoch = 22
display = 10
lr = 0.001
train_ONet(base_dir, prefix, end_epoch, display, lr)
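Note that the three snippets above are not run back to back on the same data: P-Net is trained first, and the training sets for R-Net and O-Net are then produced by running the already-trained earlier stage(s) over the images (a hard-example generation step in the reference MTCNN pipelines), which is why each snippet points at a different imglists directory.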
3、Model testing
# coding:utf-8
import sys
sys.path.append('..')
from Detection.MtcnnDetector import MtcnnDetector
from Detection.detector import Detector
from Detection.fcn_detector import FcnDetector
from Detection.train_models import P_Net, R_Net, O_Net
from prepare_data.loader import TestLoader
import cv2
import os
import numpy as np

# Which stages to run: "PNet" runs only stage 1, "RNet" stages 1-2, "ONet" all three.
test_mode = "PNet"
thresh = [0.6, 0.7, 0.7]          # score thresholds for P-Net / R-Net / O-Net
min_face_size = 20
stride = 2
slide_window = False
shuffle = False
detectors = [None, None, None]
prefix = ['../data/MTCNN_model/PNet_No_landmark/PNet',
          '../data/MTCNN_model/RNet_landmark/RNet',
          '../data/MTCNN_model/ONet_landmark/ONet']
epoch = [30, 14, 16]
batch_size = [2048, 64, 16]
model_path = ['%s-%s' % (x, y) for x, y in zip(prefix, epoch)]

# load pnet model
if slide_window:
    PNet = Detector(P_Net, 12, batch_size[0], model_path[0])
else:
    PNet = FcnDetector(P_Net, model_path[0])
detectors[0] = PNet

# load rnet model
if test_mode in ["RNet", "ONet"]:
    RNet = Detector(R_Net, 24, batch_size[1], model_path[1])
    detectors[1] = RNet

# load onet model
if test_mode == "ONet":
    ONet = Detector(O_Net, 48, batch_size[2], model_path[2])
    detectors[2] = ONet

mtcnn_detector = MtcnnDetector(detectors=detectors, min_face_size=min_face_size,
                               stride=stride, threshold=thresh, slide_window=slide_window)

# Collect the test images.
gt_imdb = []
#gt_imdb.append("35_Basketball_Basketball_35_515.jpg")
#imdb_ = dict()
#imdb_['image'] = im_path
#imdb_['label'] = 5
path = "../../DATA/test/lfpw_testImage"
for item in os.listdir(path):
    gt_imdb.append(os.path.join(path, item))
test_data = TestLoader(gt_imdb)
all_boxes, landmarks = mtcnn_detector.detect_face(test_data)

# Draw the detections on each image.
count = 0
for imagepath in gt_imdb:
    print(imagepath)
    image = cv2.imread(imagepath)
    for bbox in all_boxes[count]:
        # bbox = [x1, y1, x2, y2, score]
        cv2.putText(image, str(np.round(bbox[4], 2)), (int(bbox[0]), int(bbox[1])),
                    cv2.FONT_HERSHEY_TRIPLEX, 1, color=(255, 0, 255))
        cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255))
    '''
    for landmark in landmarks[count]:
        for i in range(len(landmark)//2):
            cv2.circle(image, (int(landmark[2*i]), int(landmark[2*i+1])), 3, (0, 0, 255))
    '''
    count = count + 1
    #cv2.imwrite("result_landmark/%d.png" % (count), image)
    cv2.imshow("lala", image)
    cv2.waitKey(0)
'''
for data in test_data:
    print(type(data))
    for bbox in all_boxes[0]:
        print(bbox)
        print((int(bbox[0]), int(bbox[1])))
        cv2.rectangle(data, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255))
    #print data
    cv2.imshow("lala", data)
    cv2.waitKey(0)
'''
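A few notes on this test configuration: `test_mode` controls how many stages are loaded ("PNet" runs only the proposal stage, "RNet" adds the refinement stage, and "ONet" runs the full cascade including landmarks); `prefix` and `epoch` are zipped into checkpoint paths such as `.../PNet-30`; and `thresh` holds the per-stage score thresholds. To keep the visualized detections instead of only displaying them, uncomment the `cv2.imwrite` line (the `result_landmark` directory must already exist, since `cv2.imwrite` does not create it).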