

文章链接:《FaceBoxes: A CPU Real-time Face Detector with High Accuracy》



FaceBoxes相当于在Faster R-CNN的基础上只保留了RPN结构,去掉了RoI pooling,并且采用了FPN,以及对anchor进行了改进,还有一点主干网络上的改动(不算什么亮点了)。

论文中的贡献有三个部分: (分别对应:主干网络、fpn、anchor)

(1)Rapidly Digested Convolutional Layers(RDCL)

(2)Multiple Scale Convolutional Layers(MSCL)

(3)Anchor densification strategy








import torch
import math
import itertools
import cv2
import numpy as np

class DataEncoder:
	"""Match face boxes to FaceBoxes default (anchor) boxes and back.

	All coordinates are normalised by the 1024-pixel training image size.
	``default_boxes`` is stored in centre form ``(cx, cy, w, h)``; ground-truth
	and decoded boxes use corner form ``(x1, y1, x2, y2)``.
	"""

	def __init__(self):
		"""Compute the default boxes for the three detection layers."""
		# Training images are 1024 pixels on a side.
		scale = 1024.
		# Anchor tiling interval (stride) of each detection layer, normalised.
		steps = [s / scale for s in (32, 64, 128)]
		# Base anchor size per layer (Inception3, Conv3_2, Conv4_2).
		# Author's note: using 64 instead of 32 yields more positive matches.
		sizes = [s / scale for s in (32, 256, 512)]
		# These act as size multipliers: every anchor is square (s*ar, s*ar).
		aspect_ratios = ((1, 2, 4), (1,), (1,))
		# Spatial size of the feature map of each detection layer.
		feature_map_sizes = (32, 16, 8)
		# Densification offsets (in eighths of the anchor size) used on the
		# first layer, one list per size multiplier: 4x4, 2x2 and 1x1 copies.
		density = [[-3, -1, 1, 3], [-1, 1], [0]]

		boxes = []
		for i, fmsize in enumerate(feature_map_sizes):
			s = sizes[i]
			for h, w in itertools.product(range(fmsize), repeat=2):
				# Anchor centre of this feature-map cell.
				cx = (w + 0.5)*steps[i]
				cy = (h + 0.5)*steps[i]
				for j, ar in enumerate(aspect_ratios[i]):
					if i == 0:
						# First layer only: shifted copies of the anchor
						# (16 + 4 + 1 = 21 boxes per cell).
						for dx, dy in itertools.product(density[j], repeat=2):
							boxes.append((cx+dx/8.*s*ar, cy+dy/8.*s*ar, s*ar, s*ar))
					else:
						boxes.append((cx, cy, s*ar, s*ar))
		# 32*32*21 + 16*16 + 8*8 = 21824 rows of (cx, cy, w, h).
		self.default_boxes = torch.Tensor(boxes)

	def test_iou(self):
		"""Print the IoU of one box against two identical half-overlapping boxes."""
		box1 = torch.Tensor([0, 0, 10, 10])
		box1 = box1[None, :]
		box2 = torch.Tensor([[5, 0, 15, 10], [5, 0, 15, 10]])
		print('iou', self.iou(box1, box2))

	def iou(self, box1, box2):
		"""Compute the pairwise intersection over union of two sets of boxes.

		Each box is ``[x1, y1, x2, y2]``.

		Args:
		  box1: (tensor) bounding boxes, sized [N,4].
		  box2: (tensor) bounding boxes, sized [M,4].

		Returns:
		  (tensor) iou, sized [N,M].
		"""
		# Broadcast [N,1,2] against [1,M,2] to get every pair at once.
		lt = torch.max(box1[:, None, :2], box2[None, :, :2])  # left top
		rb = torch.min(box1[:, None, 2:], box2[None, :, 2:])  # right bottom

		wh = (rb - lt).clamp(min=0)  # [N,M,2]; disjoint pairs clip to 0
		inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

		area1 = (box1[:, 2]-box1[:, 0]) * (box1[:, 3]-box1[:, 1])  # [N,]
		area2 = (box2[:, 2]-box2[:, 0]) * (box2[:, 3]-box2[:, 1])  # [M,]

		return inter / (area1[:, None] + area2[None, :] - inter)

	def _draw_layer(self, img, conf, start, stop, inner_radius, sample_indices, path):
		"""Write a copy of img with the positive anchors of rows [start, stop) marked."""
		h, w = img.shape[:2]
		im = img.copy()
		for i in range(start, stop):
			if conf[i] != 0:
				box_item = self.default_boxes[i]
				center = (int(box_item[0]*w), int(box_item[1]*h))
				cv2.circle(im, center, 4, (0, 255, 0))
				cv2.circle(im, center, inner_radius, (0, 0, 255))
		# Draw one sample anchor per size at the origin to show its extent.
		for idx in sample_indices:
			box = self.default_boxes[idx]
			cv2.rectangle(im, (0, 0), (int(box[2]*w), int(box[3]*h)), (0, 255, 0))
		cv2.imwrite(path, im)

	def test_encode(self, boxes, img, label):
		"""Encode boxes and dump one debug image per detection layer.

		Draws the ground-truth boxes on ``img`` in place, prints the indices of
		the positive default boxes and writes ``test_encoder_{0,1,2}.jpg``.
		"""
		loc, conf = self.encode(boxes, label)
		print('conf', type(conf), conf.size(), conf.long().sum())
		print('loc', loc)
		# img.shape is (rows, cols, channels): x scales by width, y by height.
		h, w = img.shape[:2]
		for box in boxes:
			cv2.rectangle(img, (int(box[0]*w), int(box[1]*h)), (int(box[2]*w), int(box[3]*h)), (0, 255, 0))
		for i in range(len(self.default_boxes)):
			if conf[i] != 0:
				print(i)

		# Row offsets of the three layers inside default_boxes.
		layer0_end = 32*32*21
		layer1_end = layer0_end + 16*16
		# Rows 0, 16 and 20 are the first anchors of the three layer-0 sizes.
		self._draw_layer(img, conf, 0, layer0_end, 1, (0, 16, 20), 'test_encoder_0.jpg')
		self._draw_layer(img, conf, layer0_end, layer1_end, 2, (layer0_end,), 'test_encoder_1.jpg')
		self._draw_layer(img, conf, layer1_end, len(self.default_boxes), 2, (layer1_end,), 'test_encoder_2.jpg')

	def encode(self, boxes, classes, threshold=0.35):
		"""Turn ground-truth boxes into per-default-box regression/label targets.

		Args:
		  boxes: (tensor) normalised ground-truth boxes (x1,y1,x2,y2), [num_obj,4].
		  classes: (tensor) class label of each object, [num_obj,].
		  threshold: default boxes whose best IoU is below this become background.

		Returns:
		  loc: (tensor) encoded offsets, [num_default_boxes,4].
		  conf: (tensor) labels, [num_default_boxes,]; 0 is background.

		Raises:
		  ValueError: if an encoded width/height is infinite, which happens
		    when a ground-truth box has zero width or height.
		"""
		boxes_org = boxes
		default_boxes = self.default_boxes  # [21824,4]
		num_default_boxes = default_boxes.size(0)
		num_obj = boxes.size(0)  # number of faces in the image
		if num_obj == 0:
			# No face: every default box is background with a zero offset.
			return (default_boxes.new_zeros(num_default_boxes, 4),
					classes.new_zeros(num_default_boxes))

		# IoU of every ground-truth box against every default box (corner form).
		iou = self.iou(
			boxes,
			torch.cat([default_boxes[:, :2] - default_boxes[:, 2:]/2,
						default_boxes[:, :2] + default_boxes[:, 2:]/2], 1))
		# Best default box for each object, whatever the IoU value is.
		max_iou, max_iou_index = iou.max(1)
		# Best object for each default box.
		iou, max_index = iou.max(0)
		# Make sure each object's best default box points back at that object.
		max_index[max_iou_index] = torch.arange(
			num_obj, dtype=torch.long, device=max_index.device)

		# Expand the labels to one row per default box for batched arithmetic.
		boxes = boxes[max_index]  # [21824,4]
		variances = [0.1, 0.2]
		# Centre offset relative to the default box, scaled by its size.
		cxcy = (boxes[:, :2] + boxes[:, 2:])/2 - default_boxes[:, :2]  # [21824,2]
		cxcy /= variances[0] * default_boxes[:, 2:]
		# Log ratio of ground-truth size to default box size.
		wh = (boxes[:, 2:] - boxes[:, :2]) / default_boxes[:, 2:]  # [21824,2]
		wh = torch.log(wh) / variances[1]
		if (wh.abs() > 10000).any():
			raise ValueError(
				'inf error: org_boxes {} max_iou {} max_iou_index {}'.format(
					boxes_org, max_iou, max_iou_index))
		loc = torch.cat([cxcy, wh], 1)  # [21824,4]
		conf = classes[max_index]  # [21824,]
		conf[iou < threshold] = 0  # low IoU means background
		# Author's note: forcing these positives is questionable. WiderFace has
		# zero-width labels that are not filtered out, and max(1) has to pick a
		# default box for every object, which leads to inf in the loc loss.
		# The dataset labels need fixing.
		conf[max_iou_index] = 1

		return loc, conf

	def nms(self, bboxes, scores, threshold=0.5):
		"""Greedy non-maximum suppression.

		Args:
		  bboxes: (tensor) boxes (x1,y1,x2,y2), [N,4].
		  scores: (tensor) scores, [N,].
		  threshold: boxes whose IoU with a kept box exceeds this are dropped.

		Returns:
		  (LongTensor) indices of the kept boxes, highest score first.
		"""
		x1 = bboxes[:, 0]
		y1 = bboxes[:, 1]
		x2 = bboxes[:, 2]
		y2 = bboxes[:, 3]
		areas = (x2-x1) * (y2-y1)

		_, order = scores.sort(0, descending=True)
		keep = []
		while order.numel() > 0:
			i = order[0].item()
			keep.append(i)

			if order.numel() == 1:
				break

			rest = order[1:]
			xx1 = x1[rest].clamp(min=x1[i].item())
			yy1 = y1[rest].clamp(min=y1[i].item())
			xx2 = x2[rest].clamp(max=x2[i].item())
			yy2 = y2[rest].clamp(max=y2[i].item())

			w = (xx2-xx1).clamp(min=0)
			h = (yy2-yy1).clamp(min=0)
			inter = w*h

			ovr = inter / (areas[i] + areas[rest] - inter)
			# A boolean mask keeps `order` 1-D even when one box survives.
			order = rest[ovr <= threshold]
		return torch.LongTensor(keep)

	def decode(self, loc, conf):
		"""Convert predicted loc/conf into face boxes (the inverse of encode).

		Args:
		  loc: (tensor) predicted offsets, [21824,4].
		  conf: (tensor) per-class scores, [21824,2]; column 0 is background.

		Returns:
		  Normalised boxes (x1,y1,x2,y2), their labels and their scores after
		  non-maximum suppression.
		"""
		variances = [0.1, 0.2]
		cxcy = loc[:, :2] * variances[0] * self.default_boxes[:, 2:] + self.default_boxes[:, :2]
		wh = torch.exp(loc[:, 2:] * variances[1]) * self.default_boxes[:, 2:]
		boxes = torch.cat([cxcy-wh/2, cxcy+wh/2], 1)  # [21824,4]
		# Work on a copy so the caller's scores are left untouched.
		conf = conf.clone()
		# A fixed background score: a box is a face when its face score beats it.
		conf[:, 0] = 0.4

		max_conf, labels = conf.max(1)  # [21824,]
		if labels.numel() > 0 and labels.sum().item() == 0:
			# Nothing was detected: force the per-column best rows to be faces.
			# NOTE(review): conf.max(0) yields one row per column, so the
			# 0:5 slices select at most two rows - confirm this is intended.
			sconf, slabel = conf.max(0)
			max_conf[slabel[0:5]] = sconf[0:5]
			labels[slabel[0:5]] = 1

		ids = labels.nonzero().squeeze(1)

		keep = self.nms(boxes[ids], max_conf[ids])
		# The returned boxes are still normalised.
		return boxes[ids][keep], labels[ids][keep], max_conf[ids][keep]

if __name__ == '__main__':
	# Two sample boxes in corner form (x1, y1, x2, y2), given in pixels; the
	# division by 1024 puts them in the normalised units the encoder uses.
	pixel_boxes = [[-8,-8,24,24],[400,400,500,500]]
	# Building the encoder precomputes its default boxes.
	dataencoder = DataEncoder()
	boxes = torch.Tensor(pixel_boxes)/1024



