公司原先的网络部署都是直接用python调用网络,然后用flask与后台对接,效率实在太低,一直是块心病。国庆节前无心工作,就把这件事拿出来操练了一遍,尝试用openVINO部署,在此记录下流程,后续准备把公司的网络全部转成openVINO可以处理的格式。之所以选择openVINO,是看中了它的CPU加速以及对边缘设备的支持。一般客户都不乐意为项目重新购置GPU,明明有一堆电脑闲置着,能重新利用起来直接用CPU跑网络,何乐而不为;而且通常也没有实时性处理要求,CPU的推理速度也够了。
以人脸为例,写一个5点关键点检测网络,然后转成openVINO格式进行推理。这里默认大家已经配置好了openVINO。
版本记录:openVINO–>openvino_2020.4.287,torch–>1.3.1,系统–>win10
1、数据集准备
数据集采用的是开源的名人人脸数据集(具体名字忘了,是从公司系统里拷贝下来的),数据如下:
这里再用 DBFace网络做个人脸检测,裁切成大头贴,选取了2000张,重新保存如下:
好了,现在有了原始图像了。因为只是做流程测试,人脸五点我就没自己去标注了,也是使用DBFace输出的5点信息,保存成了一个txt,代码如下:
def image_demo(path='faces_cutted/'):
    """Label every image under ``path`` with DBFace 5-point landmarks.

    For each image in which at least one face is detected, one line is
    appended to ``landmarks.txt`` in the form::

        faces_cutted/<file name> x1 y1 x2 y2 x3 y3 x4 y4 x5 y5 \\n

    Only the first detection of each image is used. Images that cannot be
    decoded, images without any detection, and images whose first detection
    carries no landmarks are skipped, so every written line is a complete
    label.

    ``DBFace``, ``detect``, ``HAS_CUDA``, ``os`` and ``cv2`` are expected to
    be provided by the surrounding module.

    Args:
        path: Directory (with trailing separator) containing the face crops.
    """
    dbface = DBFace()
    dbface.eval()
    if HAS_CUDA:
        dbface.cuda()
    dbface.load("model/dbface.pth")

    # Open the label file once (still in append mode) instead of once per image.
    with open('landmarks.txt', 'a') as f:
        for pic_name in os.listdir(path):
            print(pic_name)
            # flags=4 corresponds to cv2.IMREAD_ANYCOLOR.
            image = cv2.imread(path + pic_name, flags=4)
            if image is None:
                # cv2.imread returns None for unreadable or non-image files.
                continue

            objs = detect(dbface, image, threshold=0.5, nms_iou=0.5)
            if len(objs) == 0:
                continue

            # Only the first detection is labelled (the images are single-face crops).
            obj = objs[0]
            if not obj.haslandmark:
                # A path-only line would later be read as all-zero landmarks.
                continue

            # NOTE(review): the label prefix is hard-coded and does not follow
            # `path`; kept as-is because the training dataset reads it verbatim.
            f.write('{} '.format('faces_cutted/' + pic_name))
            for point in obj.landmark:
                x_p, y_p = point[:2]
                f.write('{0} {1} '.format(int(x_p), int(y_p)))
            f.write('\n')
保存后的txt标签信息如下:
faces_cutted/000001.jpg 4 47 54 44 25 79 17 101 55 98
faces_cutted/000002.jpg 4 44 53 42 27 67 14 92 53 90
faces_cutted/000003.jpg 60 49 86 50 98 75 66 111 87 113
faces_cutted/000004.jpg 59 65 86 63 88 89 58 111 82 109
faces_cutted/000005.jpg 23 46 67 48 46 64 28 84 65 85
faces_cutted/000006.jpg 27 46 73 41 58 67 35 90 74 86
faces_cutted/000007.jpg 16 45 64 43 42 74 23 91 64 88
faces_cutted/000008.jpg 13 39 55 40 30 70 16 84 53 85
faces_cutted/000009.jpg 32 54 81 52 62 83 37 100 78 97
faces_cutted/000010.jpg 21 45 69 45 47 74 25 90 66 89
faces_cutted/000011.jpg 23 42 70 42 50 71 27 89 68 87
faces_cutted/000012.jpg 18 46 68 43 46 72 25 95 70 91
每行依次为图像路径和5个关键点的像素坐标(按 x y 成对排列,共10个数值),关键点顺序为:左眼、右眼、鼻子、左嘴角、右嘴角。
import torch
import cv2
import numpy as np
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.models as models
class FaceLandmarksDataset(Dataset):
    """Face crops with five (x, y) landmarks listed in a text file.

    Every non-blank line of the label file has the form
    ``<image path> x1 y1 x2 y2 x3 y3 x4 y4 x5 y5`` separated by whitespace.
    Samples are returned as ``{'image': FloatTensor[3, 64, 64],
    'landmarks': FloatTensor[5, 2]}`` where the image is BGR, scaled to
    [-1, 1], and the landmarks are divided by the original image width
    (x) and height (y).

    Args:
        txt_file: Path of the label file.
        root: Prefix prepended to each image path from the label file.
    """

    def __init__(self, txt_file, root='data/'):
        # Kept for compatibility; __getitem__ does its own conversion.
        self.transform = transforms.Compose([transforms.ToTensor()])
        self.root = root
        with open(txt_file) as read_file:
            # Blank lines are not samples; dropping them keeps __len__ honest.
            self.landmarks_frame = [line for line in read_file if line.strip()]

    def __len__(self):
        return len(self.landmarks_frame)

    def num_of_samples(self):
        """Return the number of samples (same as ``len(self)``)."""
        return len(self.landmarks_frame)

    @staticmethod
    def _parse_line(line):
        """Split one label line into (relative image path, float32 array of shape (5, 2)).

        Coordinates are returned in pixels. Missing coordinates stay zero and
        values beyond the first ten are ignored.
        """
        tokens = line.split()
        if not tokens:
            raise ValueError("empty label line")
        values = tokens[1:11]
        usable = len(values) - len(values) % 2  # only complete (x, y) pairs
        landmarks = np.zeros(10, dtype=np.float32)
        landmarks[:usable] = [np.float32(v) for v in values[:usable]]
        return tokens[0], landmarks.reshape(-1, 2)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        rel_path, landmarks = self._parse_line(self.landmarks_frame[idx])
        image_path = self.root + rel_path
        img = cv2.imread(image_path)  # BGR order
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("cannot read image: {}".format(image_path))
        h, w = img.shape[:2]
        # Rescale to the fixed network input size and normalise to [-1, 1].
        img = cv2.resize(img, (64, 64))
        img = (np.float32(img) / 255.0 - 0.5) / 0.5
        # Normalise x by the original width and y by the original height.
        landmarks = landmarks / np.array([w, h], dtype=np.float32)
        # H, W, C to C, H, W
        img = img.transpose((2, 0, 1))
        return {'image': torch.from_numpy(img), 'landmarks': torch.from_numpy(landmarks)}
class ChannelPool(torch.nn.Module):
    """Sliding-window max pooling along the channel axis of an (N, C, H, W) tensor.

    The channel axis is zero-padded by ``padding`` on both sides, then a
    window of ``kernel_size`` channels is moved with step ``stride`` and the
    element-wise maximum of each window becomes one output channel. The
    result has shape ``(N, (C + 2 * padding - kernel_size) // stride + 1, H, W)``
    and lives on the same device as the input.

    Args:
        kernel_size: Number of channels covered by one window.
        stride: Step between windows; falls back to ``kernel_size`` when falsy.
        padding: Zero channels added before and after the channel axis.
        dilation, return_indices, ceil_mode: Stored but not used by
            ``forward`` (they appear to mirror the torch pooling signature).
    """

    def __init__(self, kernel_size=7, stride=2, padding=3, dilation=1,
                 return_indices=False, ceil_mode=False):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding
        self.dilation = dilation
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.compression = 2
        self.output = None

    def forward(self, input):
        if input.dim() != 4:
            raise ValueError("ChannelPool expects a 4-D (N, C, H, W) tensor")
        # The pad tuple is read from the last axis backwards, so the third
        # pair pads the channel axis. Padding value is 0 (not -inf), so
        # windows touching the border never go below zero.
        padded = torch.nn.functional.pad(
            input, (0, 0, 0, 0, self.padding, self.padding), "constant", 0)
        channels = padded.size(1)
        if channels < self.kernel_size:
            raise ValueError(
                "kernel_size {} exceeds padded channel count {}".format(
                    self.kernel_size, channels))
        n_windows = (channels - self.kernel_size) // self.stride + 1
        # Slice the whole batch at once; each window spans exactly
        # kernel_size channels.
        pooled = [
            padded[:, start:start + self.kernel_size].max(dim=1)[0]
            for start in range(0, n_windows * self.stride, self.stride)
        ]
        # No device move here: the output stays wherever the input is, so
        # the module works on CPU as well as on GPU.
        return torch.stack(pooled, dim=1)
class Net(torch.nn.Module):
    """Small CNN that regresses 10 values (five x, y landmarks) from a face crop.

    Three convolutional stages feed a ChannelPool layer and a single
    linear layer. The shape comments below assume a 3x64x64 input.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        backbone = [
            nn.Conv2d(3, 16, 3, 1),               # 16x62x62 (no padding)
            nn.Conv2d(16, 32, 3),                 # 32x60x60
            nn.BatchNorm2d(32),
            nn.PReLU(),
            nn.MaxPool2d(2, 2),                   # 32x30x30
            nn.Conv2d(32, 64, 3, padding=1),      # 64x30x30
            nn.Conv2d(64, 64, 3, padding=1),      # 64x30x30
            nn.BatchNorm2d(64),
            nn.PReLU(),
            nn.MaxPool2d(2, 2),                   # 64x15x15
            nn.Conv2d(64, 128, 3, padding=1),     # 128x15x15
            nn.Conv2d(128, 128, 3, padding=1),    # 128x15x15
            nn.BatchNorm2d(128),
            nn.PReLU(),
            nn.MaxPool2d(2, 2),                   # 128x7x7
        ]
        self.cnn_layers = nn.Sequential(*backbone)
        # kernel_size=128, stride=64: pooling across the channel axis.
        self.dw_max = ChannelPool(128, 8 * 8)
        self.fc = nn.Linear(49, 10)

    def forward(self, x):
        features = self.cnn_layers(x)
        pooled = self.dw_max(features)
        # Flatten to 49 features per sample for the linear head.
        flat = pooled.view(-1, 1 * 7 * 7)
        return self.fc(flat)
class loss_fn(torch.nn.Module):
    """Landmark error normalised by each sample's inter-ocular distance.

    For every sample the Euclidean error of each of the five points is
    divided by the distance between that sample's two ground-truth eye
    points (label points 0 and 1). The result is the sum over the batch of
    the per-sample mean error, i.e. ``sum(dist / eye_dist) / 5``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, pred, label):
        """Return the summed, eye-distance-normalised landmark error.

        Args:
            pred: Tensor viewable as (B, 5, 2) with predicted points.
            label: Tensor viewable as (B, 5, 2) with ground-truth points.
        """
        pred = pred.view(-1, 5, 2)
        label = label.view(-1, 5, 2)
        left_eye = label[:, 0, :]
        right_eye = label[:, 1, :]
        # Per-point error, shape (B, 5).
        dist = torch.sqrt(torch.sum((pred - label) ** 2, dim=2))
        # Per-sample eye distance, shape (B, 1): reduce over the coordinate
        # axis only, so one sample's normaliser never depends on the others.
        eye_dist = torch.sqrt(
            torch.sum((left_eye - right_eye) ** 2, dim=1, keepdim=True))
        # Guard against coincident eye points producing inf/nan.
        eye_dist = torch.clamp(eye_dist, min=1e-6)
        return torch.sum(dist / eye_dist) / 5
if __name__ == '__main__':
    # ----------------------------------------- 1. training --------------------------------------------
    # Train the landmark network with gradient accumulation and save the
    # whole model object to 'model_landmarks.pth'.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Net()
    model.to(device)
    ds = FaceLandmarksDataset("D:/pytorch/data/landmarks.txt")
    num_train_samples = ds.num_of_samples()
    # for i in range(len(ds)):
    #     sample = ds[i]
    #     print(i, sample['image'].size(), sample['landmarks'].size())
    #     if i == 3:
    #         break
    dataloader = DataLoader(ds, batch_size=16, shuffle=True, num_workers=4)
    # for i_batch, sample_batched in enumerate(dataloader):
    #     print(i_batch, sample_batched['image'].size(), sample_batched['landmarks'].size())
    #     if i_batch == 3:
    #         break
    num_epochs = 50
    accumulation_steps = 4
    num_batches = len(dataloader)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    landmark_loss_fn = loss_fn()
    model.train()
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        train_loss = 0.0
        for i_batch, sample_batched in enumerate(dataloader):
            image_batch = sample_batched['image'].to(device)
            landmarks_batch = sample_batched['landmarks'].to(device)
            output = model(image_batch)
            loss = landmark_loss_fn(output, landmarks_batch)
            # Scale only the gradient; the reported loss stays unscaled.
            (loss / accumulation_steps).backward()
            # Also step on the last batch so a trailing partial group is
            # applied instead of leaking into the next epoch.
            is_last_batch = (i_batch + 1) == num_batches
            if ((i_batch + 1) % accumulation_steps) == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()
            train_loss += loss.item()
        train_loss = train_loss / num_train_samples
        print('Epoch: {} \tTraining Loss: {:.6f} '.format(epoch, train_loss))
    model.eval()
    # Saves the whole pickled module (not just a state_dict); the export
    # step loads it back with torch.load.
    torch.save(model, 'model_landmarks.pth')
    # #----------------------------------------- 2. test --------------------------------------------
    # img = cv2.imread('data/face_test.png')
    # model = torch.load('model_landmarks.pth')
    # img = cv2.resize(img, (64, 64))
    # img = (np.float32(img) / 255.0 - 0.5) / 0.5
    # img = img.transpose((2, 0, 1))
    # x_input = torch.from_numpy(img).view(1, 3, 64, 64)
    # model.cuda()
    # probs = model(x_input.cuda())
    # lm_pts = probs.view(5, 2).cpu().detach().numpy()
    # print(
    #     lm_pts
    # )
    #
代码很好理解了,常规流程,加载数据集、训练,随便跑了下就收敛了,保存torch模型。注意输入图像全部归一化到64*64大小,后面转openVINO要对应上。
3、转IR格式
两步走,1、pth格式转ONNX,2、ONNX转IR,实际上做起来很简单。
pth转ONNX,注意指定输入尺寸,代码如下:
# ----------------------------------------- 3. to ONNX --------------------------------------------
# Export the saved model to ONNX with a fixed 1x3x64x64 input.
# Use the GPU when present (as before), otherwise fall back to the CPU so the
# export also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: torch.load unpickles a full module object; only load files you trust.
model = torch.load('model_landmarks.pth', map_location=device)
model.eval()
model.to(device)
dummy_input = torch.randn(1, 3, 64, 64, device=device)
# output_names must be an ordered list of strings, not a set.
torch.onnx.export(model, dummy_input, "model_landmark.onnx", output_names=["output"], verbose=True)
然后就生成了model_landmark.onnx文件,再转IR。
管理员模式打开win10终端,cd进openVINO转换工具目录,执行转换代码:
管理员CMD $:cd C:\Program Files (x86)\IntelSWTools\openvino_2020.4.287\deployment_tools\model_optimizer
执行转换 $:python mo_onnx.py --input_model D:\pytorch\model_landmark.onnx
看见结果如下:
这就是转换成功了,在转换工具目录下生成了bin和xml文件,然后就可以用openVINO部署了。转换的时候也可以使用 --data_type 参数指定模型精度(例如转成FP16半精度),比如:
python mo_onnx.py --input_model D:\pytorch\model_landmark.onnx --data_type FP16
转换后的FP16模型只有大约500K,相比全精度(FP32)模型体积直接缩减一半。还有其他神奇操作,优点太多了,反正我以后就用openVINO了。