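# NOTE: inspect the model's backbone network and check how many outputs its forward pass returns; if there are multiple outputs, ... (the original note is cut off here)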
import os
import sys
import time
import cv2
import torch
import utils
import hopenet
import argparse
import torchvision
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import numpy as np
from PIL import Image
from torchvision import transforms
from torch.autograd import Variable
from pytorch_grad_cam import GradCAM, ScoreCAM, GradCAMPlusPlus
from pytorch_grad_cam.utils.image import preprocess_image, show_cam_on_image
# Start a wall-clock timer as soon as the imports have finished.
# NOTE(review): the elapsed time printed at the end of the script is measured
# from a timer that is started later, so this initial value looks unused.
t = time.time()
# Environment / command-line parameter setup:
def parse_args():
    """Parse the command-line arguments and resolve the CUDA setting.

    Returns:
        argparse.Namespace: with attributes ``gpu_id``, ``use_cuda``,
        ``snapshot``, ``picture_path``, ``bboxes``, ``aug_smooth``,
        ``eigen_smooth`` and ``method``. ``use_cuda`` is forced to ``False``
        when ``torch.cuda.is_available()`` reports no usable device.
    """
    # Edit the ``default=`` values below to change the built-in settings.
    parser = argparse.ArgumentParser(
        description='Head pose estimation using the Hopenet network.')
    # GPU device index.
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
                        default=0, type=int)
    # CUDA is on by default, so ``--use-cuda`` alone can never turn it off;
    # ``--no-cuda`` writes to the same destination to allow forcing the CPU.
    parser.add_argument('--use-cuda', action='store_true', default=True,
                        help='Use NVIDIA GPU acceleration')
    parser.add_argument('--no-cuda', dest='use_cuda', action='store_false',
                        help='Force CPU computation even if a GPU is available')
    # Path to the trained model snapshot.
    parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.',
                        default='E://Paper/Reader/Head_pose/Deep-head-pose/pre_models/hopenet_robust_alpha1.pkl',
                        type=str)
    # Path to the input picture.
    # NOTE(review): this default ends in a directory separator, so it looks
    # truncated -- pass --picture explicitly.
    parser.add_argument('--picture', dest='picture_path', help='Path of picture',
                        default='E://Paper/Reader/', type=str)
    # Path to the bounding-box annotation file.
    # NOTE(review): this default also looks truncated -- pass --bboxes explicitly.
    parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames',
                        default='E://Paper/', type=str)
    # Grad-CAM smoothing switches.
    parser.add_argument('--aug_smooth', action='store_true',
                        help='Apply test time augmentation to smooth the CAM')
    parser.add_argument('--eigen_smooth', action='store_true',
                        help='Reduce noise by taking the first principle componenet of cam_weights*activations')
    parser.add_argument('--method', type=str, default='gradcam',
                        choices=['gradcam', 'gradcam++', 'scorecam'],
                        help='Can be gradcam / gradcam++ /scorecam')
    args = parser.parse_args()

    # Only keep CUDA enabled when a device is actually usable.
    args.use_cuda = args.use_cuda and torch.cuda.is_available()
    if args.use_cuda:
        print("Using GPU for acceleration")
    else:
        # Previously this message was printed even when the GPU was selected.
        print("Using CPU for computation")
    return args
# Number of classification bins; the same value sizes the Hopenet heads and
# the index tensor used to turn bin probabilities into an angle.
NUM_BINS = 66


def iter_bboxes(tokens):
    """Yield integer ``(x_min, y_min, x_max, y_max)`` boxes from annotation tokens.

    Args:
        tokens: whitespace-split contents of the annotation file. ``tokens[0]``
            is the picture identifier; the remaining tokens are read as
            consecutive groups of four numeric coordinates.

    Yields:
        One 4-tuple of ints per complete group. A trailing group with fewer
        than four values is ignored instead of raising ``IndexError``.
    """
    coords = tokens[1:]
    usable = len(coords) - len(coords) % 4
    for start in range(0, usable, 4):
        yield tuple(int(float(value)) for value in coords[start:start + 4])


def expand_bbox(x_min, y_min, x_max, y_max, img_width, img_height):
    """Loosen a face box by fixed pixel margins and clip it to the image.

    The box grows by 50 px to the left, right and top and by 30 px at the
    bottom, then is clamped to ``[0, img_width] x [0, img_height]``.

    Returns:
        The adjusted ``(x_min, y_min, x_max, y_max)`` tuple.
    """
    x_min = max(x_min - 50, 0)
    y_min = max(y_min - 50, 0)
    x_max = min(img_width, x_max + 50)
    y_max = min(img_height, y_max + 30)
    return x_min, y_min, x_max, y_max


def bins_to_degrees(logits, idx_tensor):
    """Convert one head's bin logits into a continuous angle in degrees.

    Computes the softmax expectation over the bin indices and maps it with
    ``expectation * 3 - 99``.

    Args:
        logits: assumed to be a ``(1, NUM_BINS)`` tensor -- TODO confirm
            against ``hopenet.Hopenet.forward``.
        idx_tensor: float tensor holding ``0 .. NUM_BINS - 1``.

    Returns:
        The angle as a Python float.
    """
    # dim=1 is what the implicit-dim softmax picked for a 2-D input.
    probabilities = F.softmax(logits, dim=1)
    return (torch.sum(probabilities * idx_tensor) * 3 - 99).item()


def main():
    """Estimate head pose for every annotated face in one picture.

    Loads the Hopenet snapshot, reads the picture and its bounding boxes,
    draws the pose axes and the yaw/pitch/roll values onto the picture,
    writes the annotated picture to disk and prints the elapsed time.
    """
    args = parse_args()

    # Grad-CAM variants selectable through --method.
    # NOTE: the heat-map visualisation itself is not wired up here yet, so
    # --method / --aug_smooth / --eigen_smooth are validated but not used.
    methods = {
        "gradcam": GradCAM,
        "scorecam": ScoreCAM,
        "gradcam++": GradCAMPlusPlus,
    }
    if args.method not in methods:
        raise Exception(f"method should be one of {list(methods.keys())}")

    cudnn.enabled = True
    gpu = args.gpu_id
    snapshot_path = args.snapshot
    out_dir = 'output/pictures'
    picture_path = args.picture_path

    # Create the output directory if it is missing.
    os.makedirs(out_dir, exist_ok=True)
    if not os.path.exists(picture_path):
        sys.exit('picture does not exist')

    # Honour the resolved CUDA setting instead of always calling .cuda().
    device = torch.device('cuda', gpu) if args.use_cuda else torch.device('cpu')

    # ResNet50 structure.
    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], NUM_BINS)

    print('Loading snapshot.')
    # SECURITY: torch.load unpickles the file -- only load trusted snapshots.
    saved_state_dict = torch.load(snapshot_path, map_location=device)
    # The weights were previously loaded but never applied to the model.
    model.load_state_dict(saved_state_dict)

    print('Loading data.')
    # Resize, centre-crop and normalise with per-channel (R, G, B) mean/std.
    transformations = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    model.to(device)
    print('Ready to test network.')
    model.eval()  # eval mode: BN uses moving mean/var.

    idx_tensor = torch.arange(NUM_BINS, dtype=torch.float32, device=device)

    picture = cv2.imread(picture_path)
    if picture is None:
        # cv2.imread returns None for unreadable or unsupported files.
        sys.exit('picture could not be read')
    height, width = picture.shape[:2]  # shape is (height, width, channels)
    print('width', width)
    print('height', height)

    # Convert to RGB once, before anything is drawn, so that crops for later
    # boxes do not contain the overlays drawn for earlier ones.
    rgb_picture = cv2.cvtColor(picture, cv2.COLOR_BGR2RGB)

    t = time.time()
    with open(args.bboxes, 'r') as f:
        # split() rather than splitlines(): the whole file is one token list.
        bbox_line_list = f.read().split()
    print('bbox_line_list', bbox_line_list)
    print('length', len(bbox_line_list))

    for x_min, y_min, x_max, y_max in iter_bboxes(bbox_line_list):
        # The axis size is based on the annotated (unexpanded) box height.
        bbox_height = abs(y_max - y_min)
        x_min, y_min, x_max, y_max = expand_bbox(
            x_min, y_min, x_max, y_max, width, height)
        if x_max <= x_min or y_max <= y_min:
            # The box lies outside the picture; an empty crop cannot be used.
            print('skipping empty bbox', (x_min, y_min, x_max, y_max))
            continue

        # Crop the face loosely and build a batch of one.
        img = Image.fromarray(rgb_picture[y_min:y_max, x_min:x_max])
        img = transformations(img).unsqueeze(0).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            yaw, pitch, roll = model(img)

        # Continuous predictions in degrees.
        yaw_predicted = bins_to_degrees(yaw, idx_tensor)
        pitch_predicted = bins_to_degrees(pitch, idx_tensor)
        roll_predicted = bins_to_degrees(roll, idx_tensor)

        # Draw the pose axes at the centre of the expanded box.
        utils.draw_axis(picture, yaw_predicted, pitch_predicted, roll_predicted,
                        tdx=(x_min + x_max) / 2, tdy=(y_min + y_max) / 2,
                        size=bbox_height / 2)

        # Print the three angles with two decimals next to the face.
        cv2.putText(picture, f"Yaw: {yaw_predicted:.2f}", (x_min + 45, y_min + 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0, 255, 0), 1)
        cv2.putText(picture, f"Pitch: {pitch_predicted:.2f}", (x_min + 45, y_min + 40),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0, 255, 0), 1)
        cv2.putText(picture, f"Roll: {roll_predicted:.2f}", (x_min + 45, y_min + 50),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0, 255, 0), 1)

        # Save at the original resolution after each box, as before.
        cv2.imwrite('E://Paper/Reader/Head_pose/Deep-head-pose/code/output/pictures/output16.jpg', picture)
        cv2.imshow('picture', picture)

    print('time taken=', time.time() - t)


if __name__ == '__main__':
    main()