Dataset
dataset定义根据数据集格式的不同各有变化,但大体结构是一致的:读取数据集信息,转换格式,再迭代输出。如若数据集能被转换为一个txt文件,可能在其他方面有些劣势,但这会让dataset类变得异常简单。如下所示,已经将带有人脸框以及关键点的标注信息转为一行数据,以空格分隔。当然这是最为简单的情况。
class WLFWDatasets(data.Dataset):
    """Dataset backed by a text file holding one annotated face per line.

    Every non-blank line is split on whitespace and read as:
    field 0 = image path, fields 1-196 = landmark values (float32),
    fields 197-202 = attribute flags (int32), fields 203-205 = Euler
    angles (float32).
    """

    def __init__(self, file_list, transforms=None):
        """Read the annotation file.

        Args:
            file_list: Path of the annotation text file.
            transforms: Optional callable ``(image, landmarks) ->
                (image, landmarks)`` applied to every sample.
        """
        self.transforms = transforms
        with open(file_list, 'r') as f:
            # Blank lines (e.g. a trailing newline) are not samples; keeping
            # them would make __getitem__ fail on an empty field list.
            self.lines = [line for line in f if line.strip()]

    def __getitem__(self, index):
        """Return ``(image, landmarks, attributes, euler_angles)`` for a line.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        # Per-sample values are kept in locals rather than on ``self`` so a
        # lookup never leaves stale state on the dataset object.
        fields = self.lines[index].strip().split()
        img = cv2.imread(fields[0])
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                'cannot read image: {}'.format(fields[0]))
        landmark = np.asarray(fields[1:197], dtype=np.float32)
        attribute = np.asarray(fields[197:203], dtype=np.int32)
        euler_angle = np.asarray(fields[203:206], dtype=np.float32)
        if self.transforms:
            # Transforms return an (image, landmarks) pair; both halves must
            # be kept, otherwise the labels no longer match the image.
            img, landmark = self.transforms(img, landmark)
        return img, landmark, attribute, euler_angle

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.lines)
Transform
除却分类或分割任务,CNN其他任务大多需要自行编写transform类以进行数据增强,因为随着对图片的改动,标签也需要随之变更。
1.compose / totensor
典中典,Compose即顺序进行数据处理,ToTensor通常为最后(不考虑利用已有信息中心化)或倒数第二步,即将numpy(一般情况) 转化为torch.Tensor类以进行后续推理。
class Compose(object):
    """Chain several ``(image, target)`` transforms into one callable."""

    def __init__(self, transforms):
        # Sequence of callables, applied in the order given.
        self.transforms = transforms

    def __call__(self, image, target):
        """Feed the sample through every transform and return the final pair."""
        sample = (image, target)
        for step in self.transforms:
            # Each step must hand back exactly two values.
            new_image, new_target = step(*sample)
            sample = (new_image, new_target)
        return sample
class ToTensor(object):
    """Convert the image to a tensor and pass the target through untouched.

    NOTE(review): ``F`` is presumably ``torchvision.transforms.functional``;
    confirm against the file's imports.
    """

    def __call__(self, image, target):
        """Return ``(F.to_tensor(image), target)``."""
        return F.to_tensor(image), target
2.random_flip
random通常作为前缀,因为随机地对数据进行处理会增强数据多样性,若是恒定处理,增强数据的作用会大大减少。水平翻转是较为简单的处理方式,即以图片的竖直中心轴为界,左右两边进行交换。关键点 xnew = 1 - xold(归一化后),图片则简单调库。
class Random_hflip(object):
    """Randomly mirror an image and its landmarks left-to-right."""

    def __init__(self, p, mirror_indices=None):
        """Configure the flip.

        Args:
            p: Probability of flipping a sample.
            mirror_indices: Optional sequence giving, for each output point,
                the index of the flipped point to take. Use it to swap
                left/right landmark identities after mirroring. ``None``
                keeps the original point order.
        """
        self.p = p
        self.mirror_indices = mirror_indices

    def __call__(self, image, landmarks):
        """Return ``(image, landmarks)``, mirrored with probability ``p``.

        ``landmarks`` is read as flat ``x0, y0, x1, y1, ...`` values with x
        normalised to [0, 1]; the result is a flat array of the same length.
        The inputs are never modified in place.
        """
        if random.random() >= self.p:
            return image, np.asarray(landmarks)

        # np.flip returns a view with negative strides; make it contiguous.
        flipped = np.ascontiguousarray(np.flip(image, axis=1))

        # np.array copies, so the caller's landmark array stays intact.
        points = np.array(landmarks).reshape(-1, 2)
        points[:, 0] = 1 - points[:, 0]
        if self.mirror_indices is not None:
            points = points[np.asarray(self.mirror_indices)]

        # Return the flipped data; returning the inputs here would make the
        # augmentation a no-op.
        return flipped, points.reshape(-1)
3.random_noise
同理,随机决定是否给图片增加噪声:比如限定一个范围如0-0.2,生成一个宽高与原图相同的矩阵,每个像素值在0-0.2*255之间,随机将其与原图相加或相减,最后再clip一下(截断到0-255)即可。
同理比如亮度调整,只不过是整体乘以个1.2或0.9,大同小异。
class Random_Noise(object):
    """Randomly add or subtract uniform per-pixel noise."""

    def __init__(self, p, limit):
        """Configure the noise.

        Args:
            p: Probability of perturbing a sample.
            limit: Upper bound of the noise as a fraction of 255.
        """
        self.p = p
        self.limit = limit

    def __call__(self, image, landmarks):
        """Return ``(image, landmarks)`` with noise applied with probability ``p``.

        One noise value is drawn per pixel and shared by all channels. The
        result is clipped to [0, 255] and keeps the input dtype; the input
        image is not modified. Landmarks pass through unchanged.
        """
        if random.random() >= self.p:
            return image, landmarks

        noise = np.random.uniform(0, self.limit, size=image.shape[:2]) * 255
        # Whole intensity steps, as the original uint8 cast produced.
        noise = np.floor(noise)
        if image.ndim == 3:
            noise = noise[:, :, np.newaxis]

        sign = 1.0 if random.random() > 0.5 else -1.0
        # Do the arithmetic in float: uint8 +=/-= wraps around (250 + 40
        # becomes 34), which makes a later clip useless.
        noisy = image.astype(np.float32) + sign * noise
        return np.clip(noisy, 0, 255).astype(image.dtype), landmarks
4.random_rotation
无论何时,旋转永远是重头戏。此处有一点设计,能最大程度保留图像的特征信息,同时也能进行旋转操作。
class Random_Rotation(object):
    """Randomly rotate an image about its centre without cropping corners."""

    def __init__(self, p, max_alpha, out_size=112):
        """Configure the rotation.

        Args:
            p: Probability of rotating a sample.
            max_alpha: Upper bound of the rotation angle in degrees; the
                angle is drawn from [0, max_alpha) and truncated to an int.
            out_size: Side length the rotated image is resized to.
        """
        self.p = p
        self.alpha = max_alpha
        self.out_size = out_size

    @staticmethod
    def _map_landmarks(landmarks, matrix, src_size, dst_size):
        """Push normalised landmarks through a 2x3 affine matrix.

        Args:
            landmarks: Flat ``x0, y0, x1, y1, ...`` values normalised by the
                source width/height.
            matrix: 2x3 affine matrix acting on source pixel coordinates.
            src_size: ``(width, height)`` of the source image.
            dst_size: ``(width, height)`` of the warped image.

        Returns:
            Flat float64 array of landmarks normalised by ``dst_size``.
        """
        pixels = np.asarray(landmarks, dtype=np.float64).reshape(-1, 2)
        pixels = pixels * np.asarray(src_size, dtype=np.float64)
        # Homogeneous [x, y, 1] rows so the translation column is applied.
        homogeneous = np.hstack([pixels, np.ones((pixels.shape[0], 1))])
        warped = homogeneous @ np.asarray(matrix, dtype=np.float64).T
        return (warped / np.asarray(dst_size, dtype=np.float64)).reshape(-1)

    def __call__(self, image, landmarks):
        """Return ``(image, landmarks)``, rotated with probability ``p``.

        When no rotation is drawn the inputs are returned unchanged, so the
        transform always yields a pair and can be chained. When rotating,
        the image is warped onto the bounding rectangle of the rotated
        frame, resized to ``out_size`` x ``out_size``, and the landmarks are
        re-normalised to that rectangle.
        """
        if random.random() >= self.p:
            return image, landmarks

        angle = int(random.random() * self.alpha)
        rows, cols = image.shape[:2]
        # M is a 2x3 matrix: the left 2x2 rotates, the last column translates.
        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
        cos = np.abs(M[0, 0])
        sin = np.abs(M[0, 1])
        # Bounding rectangle of the rotated image.
        new_w = rows * sin + cols * cos
        new_h = rows * cos + cols * sin
        # Shift so the old image centre lands on the new canvas centre.
        M[0, 2] += (new_w - cols) * 0.5
        M[1, 2] += (new_h - rows) * 0.5
        w = int(np.round(new_w))
        h = int(np.round(new_h))

        rotated = cv2.warpAffine(image, M, (w, h))
        new_pt = self._map_landmarks(landmarks, M, (cols, rows), (w, h))

        # Keep the caller's float dtype so rotated and non-rotated samples
        # share one dtype.
        source_dtype = np.asarray(landmarks).dtype
        if np.issubdtype(source_dtype, np.floating):
            new_pt = new_pt.astype(source_dtype)

        new_image = cv2.resize(rotated, (self.out_size, self.out_size))
        return new_image, new_pt
Loss
关于loss其实并不难,pfld原论文网络的巧思在于增加了辅助网络,利用已经给定的roll-pitch-yaw三角度,通过辅助网络的输出,于loss中起到作用,并回传影响主干网络输出。
主要起作用的是weighted_loss即利用已有属性标签和角度计算得出的误差。
class PFLDLoss(nn.Module):
    """Landmark loss weighted by pose error and attribute rarity."""

    def __init__(self):
        super(PFLDLoss, self).__init__()

    def forward(self, attribute_gt, landmark_gt, euler_angle_gt, angle,
                landmarks, train_batchsize):
        """Compute the weighted wing loss and the plain L2 distance.

        Args:
            attribute_gt: Attribute flags; columns 1..5 are used.
            landmark_gt: Ground-truth landmarks.
            euler_angle_gt: Ground-truth Euler angles.
            angle: Predicted Euler angles.
            landmarks: Predicted landmarks.
            train_batchsize: Weight given to attributes that are absent from
                the current batch.

        Returns:
            ``(mean weighted wing loss, mean squared L2 distance)``.
        """
        # Pose weight per sample: sum over the angle axis of 1 - cos(error).
        weight_angle = torch.sum(1 - torch.cos(angle - euler_angle_gt), dim=1)

        # Attribute flags per sample, e.g. [0, 0, 0, 1, 0].
        attributes_w_n = attribute_gt[:, 1:6].float()

        # Inverse batch frequency of each attribute; an attribute absent from
        # the batch gets train_batchsize instead of a division by zero.
        # Built with tensor ops so it stays on the inputs' device and does
        # not depend on a module-level `device`.
        mat_ratio = torch.mean(attributes_w_n, dim=0)
        mat_ratio = torch.where(
            mat_ratio > 0,
            1.0 / mat_ratio,
            torch.full_like(mat_ratio, float(train_batchsize)))

        # Only attributes a sample actually has contribute; a sample with no
        # attribute set ends up with weight 0 here.
        weight_attribute = torch.sum(attributes_w_n * mat_ratio, dim=1)

        diff = landmark_gt - landmarks
        l2_distant = torch.sum(diff * diff, dim=1)

        # NOTE(review): wing_loss is defined elsewhere; its output must be
        # broadcastable against the per-sample weight - confirm.
        wingloss = wing_loss(landmark_gt, landmarks)

        w = weight_angle * weight_attribute
        # Fall back to weight 1 where the product is not positive, so those
        # samples still contribute to the loss.
        weight = torch.where(w > 0, w, torch.ones_like(w))
        weight_loss = weight * wingloss
        return torch.mean(weight_loss), torch.mean(l2_distant)
Train
训练过程代码相对固定,即将处理好的数据输入网络,得到输出,利用输出计算loss,loss回传而后更新权重。
def train(train_loader, pfld_backbone, auxiliarynet, criterion, optimizer,
          epoch):
    """Run one pass over ``train_loader`` and update both networks.

    Args:
        train_loader: Iterable yielding ``(img, landmark_gt, attribute_gt,
            euler_angle_gt)`` batches.
        pfld_backbone: Network returning ``(features, landmarks)``.
        auxiliarynet: Network mapping the features to Euler angles.
        criterion: Loss returning ``(weighted_loss, loss)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Current epoch index (not used inside this function).

    Returns:
        ``(weighted_loss, loss)`` of the last batch, or ``(None, None)`` if
        the loader yields nothing.
    """
    # Running average of `loss`; it is updated but not returned.
    losses = AverageMeter()
    weighted_loss, loss = None, None

    # Loop-invariant: move the models once instead of once per batch.
    pfld_backbone = pfld_backbone.to(device)
    auxiliarynet = auxiliarynet.to(device)
    # Make sure layers such as dropout/batch-norm are in training mode even
    # if an earlier evaluation pass switched the models to eval().
    pfld_backbone.train()
    auxiliarynet.train()

    for img, landmark_gt, attribute_gt, euler_angle_gt in tqdm(train_loader):
        img = img.to(device)
        attribute_gt = attribute_gt.to(device)
        landmark_gt = landmark_gt.to(device)
        euler_angle_gt = euler_angle_gt.to(device)

        features, landmarks = pfld_backbone(img)
        angle = auxiliarynet(features)

        # weighted_loss drives the gradient step; loss is only tracked.
        weighted_loss, loss = criterion(attribute_gt, landmark_gt,
                                        euler_angle_gt, angle, landmarks,
                                        args.train_batchsize)
        optimizer.zero_grad()
        weighted_loss.backward()
        optimizer.step()

        losses.update(loss.item())
    return weighted_loss, loss
Camera
最后的效果可以用数据集中的验证集来看,也可以通过电脑自带摄像头来看。其中主要步骤为:
1.人脸检测,
2.一定比例放大为正方形检测框
3.判断检测框是否超出边界
4.数据预处理放入关键点检测网络
5.输出关键点参数并画图。
def _square_crop(box, width, height, scale=1.3):
    """Expand a detection box to a square and clamp it to the frame.

    Args:
        box: Sequence whose first four entries are ``x1, y1, x2, y2``.
        width: Frame width in pixels.
        height: Frame height in pixels.
        scale: Enlargement factor applied to the longer box side.

    Returns:
        ``((x1, y1, x2, y2), (edx1, edy1, edx2, edy2), size)``: the clamped
        crop rectangle, the padding needed on the left/top/right/bottom to
        restore the full square, and the side length of the square.
    """
    x1, y1, x2, y2 = (int(v) for v in
                      (np.asarray(box[:4]) + 0.5).astype(np.int32))
    w = x2 - x1 + 1
    h = y2 - y1 + 1
    cx = x1 + w // 2
    cy = y1 + h // 2
    size = int(max(w, h) * scale)
    x1 = cx - size // 2
    x2 = x1 + size
    y1 = cy - size // 2
    y2 = y1 + size
    # The overshoot must be measured BEFORE clamping; measured afterwards it
    # is always zero and the crop is never padded back to a square.
    pad = (max(0, -x1), max(0, -y1), max(0, x2 - width), max(0, y2 - height))
    rect = (max(0, x1), max(0, y1), min(width, x2), min(height, y2))
    return rect, pad, size


# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(args.model_path, map_location=device)
pfld_backbone = PFLDInference().to(device)
pfld_backbone.load_state_dict(checkpoint['pfld_backbone'])
pfld_backbone.eval()
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()])

cap = cv2.VideoCapture(0)
try:
    while True:
        ret, img = cap.read()
        if not ret:
            break
        height, width = img.shape[:2]
        bounding_boxes, _ = detect_faces(img)
        for box in bounding_boxes:
            (x1, y1, x2, y2), (edx1, edy1, edx2, edy2), size = _square_crop(
                box, width, height)
            cropped = img[y1:y2, x1:x2]
            if cropped.size == 0:
                # Box lies entirely outside the frame; nothing to resize.
                continue
            if edx1 > 0 or edy1 > 0 or edx2 > 0 or edy2 > 0:
                # Pad with black so the crop is square again.
                cropped = cv2.copyMakeBorder(cropped, edy1, edy2, edx1, edx2,
                                             cv2.BORDER_CONSTANT, 0)
            face = cv2.resize(cropped, (112, 112))

            start = time.time()
            face = transform(face).unsqueeze(0).to(device)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                _, landmarks = pfld_backbone(face)
            # Scale the normalised output to the square crop, then remove
            # the padding offset so points are relative to the clamped crop.
            pre_landmark = landmarks[0].cpu().numpy().reshape(
                -1, 2) * [size, size] - [edx1, edy1]
            end = time.time()
            print('推理时间为: {}ms'.format((end - start) * 1000))

            for (x, y) in pre_landmark.astype(np.int32):
                cv2.circle(img, (int(x1 + x), int(y1 + y)), 1, (0, 0, 255))
        cv2.imshow('face_landmark_68', img)
        # Esc quits.
        if cv2.waitKey(10) == 27:
            break
finally:
    # Free the camera and close the preview window even on error.
    cap.release()
    cv2.destroyAllWindows()