Visualizing the Hopenet Model with CAM

Here we use CAM to visualize the intermediate layers of the model from the paper, which makes it easy to see where the model focuses its attention. One caveat: check the model's backbone and how many outputs its forward pass produces; if the forward returns several outputs, reduce them to a single one, as in the sketch below.
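
Hopenet's forward pass returns three tensors (yaw, pitch and roll), while pytorch-grad-cam expects a model with a single output, so a thin wrapper is enough. A minimal sketch, assuming the usual Hopenet forward signature (the HopenetSingleOutput name is my own):

import torch.nn as nn

class HopenetSingleOutput(nn.Module):
    """Wrap Hopenet so forward() returns one tensor, as pytorch-grad-cam expects."""

    def __init__(self, hopenet_model):
        super().__init__()
        self.model = hopenet_model

    def forward(self, x):
        yaw, pitch, roll = self.model(x)
        return yaw  # expose the yaw head; return pitch or roll to visualize those instead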

import os
import sys
import time
import cv2
import torch
import utils
import hopenet
import argparse
import torchvision
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import numpy as np

from PIL import Image
from torchvision import transforms
from pytorch_grad_cam import GradCAM, ScoreCAM, GradCAMPlusPlus
from pytorch_grad_cam.utils.image import preprocess_image, show_cam_on_image



# Argument parsing:
def parse_args():
    """Parse input arguments."""
    # use 'default = ' to change parameters
    parser = argparse.ArgumentParser(description = 'Head pose estimation using the Hopenet network.')
    # gpu
    parser.add_argument('--gpu', dest = 'gpu_id', help = 'GPU device id to use [0]',
                        default = 0, type = int)
    # use gpus
    parser.add_argument('--use-cuda', action = 'store_true', default = True,
                        help = 'Use NVIDIA GPU acceleration')
    # path to model
    parser.add_argument('--snapshot', dest = 'snapshot', help = 'Path of model snapshot.',
                        default = 'E://Paper/Reader/Head_pose/Deep-head-pose/pre_models/hopenet_robust_alpha1.pkl',
                        type = str)

    # picture path
    parser.add_argument('--picture', dest = 'picture_path', help = 'Path of picture', default = 'E://Paper/Reader/'
                                                                                                'Head_pose/Deep-head-pose/'
                                                                                                'input/320_2.jpg')
    # bbox of picture
    parser.add_argument('--bboxes', dest = 'bboxes', help = 'Bounding box annotations of frames', default = 'E://Paper/'
                                                                                                            'Reader/Head_pose/Deep-head-pose/bbox/320_3.txt')
    # aug smooth
    parser.add_argument('--aug_smooth', action = 'store_true',
                        help = 'Apply test time augmentation to smooth the CAM')
    # eigen smooth
    parser.add_argument('--eigen_smooth', action = 'store_true',
                        help = 'Reduce noise by taking the first principal component of cam_weights*activations')

    parser.add_argument('--method', type = str, default = 'gradcam',
                        choices = ['gradcam', 'gradcam++', 'scorecam'], help = 'Can be gradcam / gradcam++ /scorecam')

    args = parser.parse_args()
    # report whether CUDA will be used
    args.use_cuda = args.use_cuda and torch.cuda.is_available()
    if args.use_cuda:
        print("Using GPU for acceleration")
    else:
        print("Using CPU for computation")

    return args


if __name__ == '__main__':

    args = parse_args()
    # methods to show hot map
    methods = \
        {"gradcam": GradCAM,
         "scorecam": ScoreCAM,
         "gradcam++": GradCAMPlusPlus
         }
    # use cudnn
    cudnn.enabled = True
    # batch size
    batch_size = 32
    # gpu device id
    gpu = args.gpu_id
    # path to the model snapshot
    snapshot_path = args.snapshot

    # directory for output pictures
    out_dir = 'output/pictures'

    # path to read pictures
    picture_path = args.picture_path

    if args.method not in list(methods.keys()):
        raise Exception(f"method should be one of {list(methods.keys())}")

    # if directory of save not exist, create one
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # exit if the picture path does not exist
    if not os.path.exists(args.picture_path):
        sys.exit('picture does not exist')

    # ResNet50 structure
    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)

    print('Loading snapshot.')
    # Load snapshot

    saved_state_dict = torch.load(snapshot_path)
    model.load_state_dict(saved_state_dict)

    print('Loading data.')

    # Image preprocessing: resize, center crop, convert to tensor, normalize with ImageNet stats
    transformations = transforms.Compose([transforms.Resize(224),
                                          transforms.CenterCrop(224),
                                          transforms.ToTensor(),
                                          transforms.Normalize(mean = [0.485, 0.456, 0.406],
                                                               std = [0.229, 0.224, 0.225])])

    model.cuda(gpu)  # move the model to the GPU

    print('Ready to test network.')

    # Test the Model
    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    total = 0

    # Bin indices 0..65 for converting the 66-bin classification output to degrees
    idx_tensor = [idx for idx in range(66)]
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

    # read picture
    picture = cv2.imread(picture_path)

    width = int(picture.shape[1])   # shape[1] is the image width
    height = int(picture.shape[0])  # shape[0] is the image height
    print('width', width)
    print('height', height)

    # Define the codec and create VideoWriter object (leftover from the video version of this script)
    # fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    # rename outputs
    # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height))

    # Visualize the model with a CAM heat map (kept commented out; see the usage sketch at the end)
    # model_1 = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 1000)
    # model_1.eval()
    # target_layer = model_1.layer3[-1]
    # print('the target layer:', target_layer)
    # cam = methods[args.method](model = model_1,
    #                            target_layer = target_layer,
    #                            use_cuda = args.use_cuda, )

    t = time.time()
    idx = 0
    count = 0
    # Show the bounding boxes, starting from a single picture
    with open(args.bboxes, 'r') as f:
        # split() on whitespace yields a flat token list with no '\n' or empty strings
        bbox_line_list = f.read().split()  # use split instead of splitlines
        print('bbox_line_list', bbox_line_list)
        print('length', len(bbox_line_list))

    # While there are bbox tokens left, process one face box per iteration
    # (the loop exits via the break at the bottom)
    while len(bbox_line_list) > 0:

        line = bbox_line_list
        # print('line', line)
        pic_number = line[0]
        # print('This is the No.%s picture' % pic_number)

        cv2_pic = cv2.cvtColor(picture, cv2.COLOR_BGR2RGB)  # convert the current picture from BGR to RGB

        # Start detecting the pose
        # print("Start detecting pose ...")
        # Use the idx offset to step through multiple face boxes within a single image
        x_min, y_min, x_max, y_max = (int(float(line[idx + 1])), int(float(line[idx + 2])),
                                      int(float(line[idx + 3])), int(float(line[idx + 4])))
        # print('bbox1', x_min, y_min, x_max, y_max)
        bbox_width = abs(x_max - x_min)
        bbox_height = abs(y_max - y_min)

        x_min -= 50
        x_max += 50
        y_min -= 50
        y_max += 30
        x_min = max(x_min, 0)
        y_min = max(y_min, 0)

        x_max = min(picture.shape[1], x_max)
        y_max = min(picture.shape[0], y_max)
        # Crop face loosely
        img = cv2_pic[y_min:y_max, x_min:x_max]  # the loose crop keeps the face's aspect ratio
        img = Image.fromarray(img)  # convert the NumPy array to a PIL Image

        # Transform
        img = transformations(img)
        img_shape = img.size()
        img = img.view(1, img_shape[0], img_shape[1], img_shape[2])
        img = img.cuda(gpu)  # Variable is deprecated; plain tensors work directly

        # Forward pass: the Hopenet network returns the three Euler angles
        yaw, pitch, roll = model(img)

        # CAM heat-map analysis of the cropped face region (commented out):
        # rgb_img = cv2_pic[y_min:y_max, x_min:x_max]
        # rgb_img = cv2.resize(rgb_img, (256, 256))
        # rgb_img = np.float32(rgb_img) / 255
        # input_tensor = preprocess_image(rgb_img, mean = [0.485, 0.456, 0.406],
        #                                 std = [0.229, 0.224, 0.225])
        # print(input_tensor)
        #
        # target_category = None
        # cam.batch_size = 32
        # # use aug_smooth and eigen_smooth to smooth the visualization
        # grayscale_cam = cam(input_tensor = input_tensor,
        #                     target_category = target_category,
        #                     aug_smooth = args.aug_smooth,
        #                     eigen_smooth = args.eigen_smooth)
        #
        # grayscale_cam = grayscale_cam[0, :]
        # cam_image = show_cam_on_image(rgb_img, grayscale_cam)
        # cv2.imwrite(f'{args.method}_bbox_layer4_%s.jpg'%count, cam_image)

        # At this point yaw, pitch and roll are (1, 66) tensors
        yaw_predicted = F.softmax(yaw, dim = 1)
        pitch_predicted = F.softmax(pitch, dim = 1)
        roll_predicted = F.softmax(roll, dim = 1)

        # Get continuous predictions in degrees:
        # expectation over the 66 bins (3 degrees per bin), shifted by -99
        yaw_predicted = torch.sum(yaw_predicted.data * idx_tensor) * 3 - 99
        pitch_predicted = torch.sum(pitch_predicted.data * idx_tensor) * 3 - 99
        roll_predicted = torch.sum(roll_predicted.data * idx_tensor) * 3 - 99

        # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2,
        # (y_min + y_max) / 2, size = bbox_width)
        # Draw the three head-pose angle axes on top of the original image
        utils.draw_axis(picture, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2,
                        tdy = (y_min + y_max) / 2, size = bbox_height / 2)

        # The three angles are tensors, so Python's round() cannot be applied directly
        # print('the sum of yaw', yaw_predicted)
        # print('the sum of pitch', pitch_predicted)
        # print('the sum of roll', roll_predicted)

        # Plot expanded bounding box
        # cv2.rectangle(picture, (x_min, y_min), (x_max, y_max), (0, 255, 0), 1)

        # Overlay the three head-pose angles on the image;
        # use '%.2f' % num (or '{:.2f}'.format(num)) to limit the decimals
        cv2.putText(picture, f"Yaw: {'%.2f' % yaw_predicted}", (x_min + 45, y_min + 30), cv2.FONT_HERSHEY_COMPLEX_SMALL,
                    0.5, (0, 255, 0), 1)
        cv2.putText(picture, f"Pitch: {'%.2f' % pitch_predicted}", (x_min + 45, y_min + 40),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0, 255, 0), 1)
        cv2.putText(picture, f"Roll: {'%.2f' % roll_predicted}", (x_min + 45, y_min + 50),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0, 255, 0), 1)

        # Save the picture at its original width and height
        # (a wrong path here once cost half a day of debugging)
        cv2.imwrite('E://Paper/Reader/Head_pose/Deep-head-pose/code/output/pictures/output16.jpg', picture)
        cv2.imshow('picture', picture)
        # cv2.waitKey(0)  # uncomment to pause on each result
        cv2.destroyAllWindows()

        count += 1
        idx += 4
        if idx + 4 >= len(line):
            # exit the loop once all boxes have been consumed
            break

    # # Heat map for the whole image after the pose axes are drawn (commented out):
    # rgb_img = picture
    # rgb_img = cv2.resize(rgb_img, (256, 256))
    # rgb_img = np.float32(rgb_img) / 255
    # input_tensor = preprocess_image(rgb_img, mean = [0.485, 0.456, 0.406],
    #                                 std = [0.229, 0.224, 0.225])
    # print(input_tensor)
    #
    # target_category = None
    # cam.batch_size = 32
    # # use aug_smooth and eigen_smooth to smooth the visualization
    # grayscale_cam = cam(input_tensor = input_tensor,
    #                     target_category = target_category,
    #                     aug_smooth = args.aug_smooth,
    #                     eigen_smooth = args.eigen_smooth)
    #
    # grayscale_cam = grayscale_cam[0, :]
    # cam_image = show_cam_on_image(rgb_img, grayscale_cam)
    # cv2.imwrite(f'{args.method}_bbox_layer3.jpg', cam_image)

    print('time taken=', time.time() - t)
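
For reference, the commented-out CAM blocks above can be revived with the wrapper. A minimal sketch, assuming the HopenetSingleOutput wrapper from the top of this post, the older pytorch-grad-cam API that this script imports (target_layer / target_category keywords), and that the Hopenet instance exposes a layer4 attribute like its ResNet backbone:

wrapped = HopenetSingleOutput(model)
wrapped.eval()
cam = methods[args.method](model = wrapped,
                           target_layer = model.layer4[-1],  # last bottleneck block of the backbone
                           use_cuda = args.use_cuda)

rgb_img = cv2.resize(cv2.cvtColor(picture, cv2.COLOR_BGR2RGB), (224, 224))
rgb_img = np.float32(rgb_img) / 255
input_tensor = preprocess_image(rgb_img, mean = [0.485, 0.456, 0.406],
                                std = [0.229, 0.224, 0.225])

grayscale_cam = cam(input_tensor = input_tensor,
                    target_category = None,  # None lets the library pick the highest-scoring bin
                    aug_smooth = args.aug_smooth,
                    eigen_smooth = args.eigen_smooth)
cam_image = show_cam_on_image(rgb_img, grayscale_cam[0, :])
cv2.imwrite(f'{args.method}_hopenet_layer4.jpg', cam_image)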
