End-to-End Object Detection with Transformers
-
An end-to-end Transformer-based object detection network proposed by Facebook, published at ECCV 2020; the code is open source: facebookresearch/detr: End-to-End Object Detection with Transformers (github.com). What is kept here is a simplified model that loads pretrained DETR weights and runs inference on images.
-
DETR model constructor
def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
    super().__init__()
    # create ResNet-50 backbone: imported from torchvision.models as the feature
    # extractor, so its fully connected classification head is not needed
    self.backbone = resnet50()
    del self.backbone.fc
    # create conversion layer: project the 2048-channel backbone features down to
    # hidden_dim=256 channels, using 256 1x1x2048 convolution kernels
    self.conv = nn.Conv2d(2048, hidden_dim, 1)
    # the standard transformer from torch's nn module, with the hyperparameters set above
    self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)  # class head: one score per class, plus one extra 'no object' class
    self.linear_bbox = nn.Linear(hidden_dim, 4)  # box head: a 1x4 vector [cx, cy, w, h] in normalized coordinates
    # output positional encodings (object queries)
    self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))  # 100 object queries, assuming no image contains more than 100 objects to detect
    # spatial positional encodings
    self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    # note that in baseline DETR we use sine positional encodings
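A quick sanity check of these layers, as a minimal sketch (DETRdemo is the class from the full listing below):
model = DETRdemo(num_classes=91)
print(model.conv)             # Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
print(model.query_pos.shape)  # torch.Size([100, 256])
print(model.row_embed.shape)  # torch.Size([50, 128]) -- half of hidden_dim per spatial axis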
- DETR forward function
def forward(self, inputs):
    # propagate inputs through ResNet-50 up to the avg-pool layer
    x = self.backbone.conv1(inputs)  # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
    x = self.backbone.bn1(x)
    x = self.backbone.relu(x)
    x = self.backbone.maxpool(x)  # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    x = self.backbone.layer1(x)
    x = self.backbone.layer2(x)
    x = self.backbone.layer3(x)
    x = self.backbone.layer4(x)
    # convert from 2048 to 256 feature planes for the transformer
    h = self.conv(x)
    # construct positional encodings
    H, W = h.shape[-2:]
    pos = torch.cat([
        self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
        self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
    ], dim=-1).flatten(0, 1).unsqueeze(1)
    # propagate through the transformer
    h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                         self.query_pos.unsqueeze(1)).transpose(0, 1)
    # finally project transformer outputs to class labels and bounding boxes
    return {'pred_logits': self.linear_class(h),
            'pred_boxes': self.linear_bbox(h).sigmoid()}
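To make the tensor shapes concrete, here is a hedged trace for an assumed 1x3x800x1066 input (e.g. a 640x480 image after Resize(800)); the backbone downsamples by 32, so the encoder sees 25*34 = 850 tokens:
x = torch.rand(1, 3, 800, 1066)  # dummy batch; the size is an illustrative assumption
# backbone layer4 output: [1, 2048, 25, 34]; after self.conv: [1, 256, 25, 34]
# h.flatten(2).permute(2, 0, 1): [850, 1, 256] -> encoder input sequence
# query_pos.unsqueeze(1): [100, 1, 256] -> decoder input (object queries)
out = detr(x)  # assumes detr = DETRdemo(num_classes=91), as in the full listing below
print(out['pred_logits'].shape)  # torch.Size([1, 100, 92])
print(out['pred_boxes'].shape)   # torch.Size([1, 100, 4])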
-
For a walkthrough of DETR's forward pass, see https://www.bilibili.com/video/BV1GB4y1X72R starting at 30:00.
-
This program's DETR model uses ResNet-50 as the feature extractor. Forward pass of ResNet's BasicBlock (note that ResNet-50 itself stacks Bottleneck blocks; BasicBlock is used by ResNet-18/34, but the residual pattern is the same):
-
def forward(self, x: Tensor) -> Tensor:
    identity = x
    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.conv2(out)
    out = self.bn2(out)
    if self.downsample is not None:
        identity = self.downsample(x)
    out += identity
    out = self.relu(out)
    return out
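For reference, ResNet-50 builds layer1-layer4 from Bottleneck blocks; a minimal sketch of the corresponding forward pass, mirroring torchvision's implementation (same residual pattern, three convolutions):
def forward(self, x: Tensor) -> Tensor:
    identity = x
    out = self.relu(self.bn1(self.conv1(x)))    # 1x1 conv, reduce channels
    out = self.relu(self.bn2(self.conv2(out)))  # 3x3 conv (stride 2 in downsampling blocks)
    out = self.bn3(self.conv3(out))             # 1x1 conv, expand channels (x4)
    if self.downsample is not None:
        identity = self.downsample(x)           # 1x1 conv + BN to match shapes for the skip
    out += identity                             # residual connection
    return self.relu(out)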
-
Loading the pretrained weight file
-
state_dict = torch.hub.load_state_dict_from_url(  # downloads the weight file on the first run; later runs load it from the local cache unless it is deleted
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)
-
Reading the tensor shapes in the weight file is a good way to understand the model's operations and what attributes each layer contains.
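For example, a minimal sketch that prints the first few checkpoint entries (the key names follow the module names in DETRdemo):
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
# e.g. backbone.conv1.weight (64, 3, 7, 7), backbone.bn1.weight (64,), ...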
-
Detection function
-
def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    # the demo model only supports, by default, images with an aspect ratio between 0.5 and 2;
    # to use images outside this range, rescale so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'
    # propagate the preprocessed image through the model
    outputs = model(img)
    # keep only predictions with 0.7+ confidence; lowering the threshold keeps more boxes
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7
    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled  # confidence tensor and box annotations at original-image scale
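The last softmax column is the score of the extra 'no object' class, which is why it is dropped before thresholding. A minimal sketch of the filtering step, assuming an illustrative stricter threshold of 0.9 (the demo itself uses 0.7):
probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]  # drop the 'no object' column -> [100, 91]
keep = probas.max(-1).values > 0.9  # 0.9 is an illustrative value; higher keeps fewer, cleaner boxes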
-
Full code (runnable in Colab). To run inference on your own image, edit the image-loading lines near the end (test_img / Image.open).
from PIL import Image
# this program downloads data from the web (a few test images and the model file); requests makes the downloads convenient
import requests
import matplotlib.pyplot as plt
# use %config InlineBackend.figure_format = 'retina' after %matplotlib inline to render higher-resolution figures
# %config InlineBackend.figure_format = 'retina'
import torch
from torch import nn
# the backbone is resnet50, used for feature extraction
from torchvision.models import resnet50
import torchvision.transforms as T
# inference involves no gradient computation; disabling it saves memory and leaves the pretrained weights untouched
torch.set_grad_enabled(False)
class DETRdemo(nn.Module):
    """
    Demo DETR implementation.
    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
    Only batch size 1 supported.
    """
    # DETR constructor: by default 6 encoder and 6 decoder layers with 8 attention
    # heads; for the COCO dataset, num_classes is 91
    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()
        # create ResNet-50 backbone: imported from torchvision.models as the feature
        # extractor, so its fully connected classification head is not needed
        self.backbone = resnet50()
        del self.backbone.fc
        # create conversion layer: project the 2048-channel backbone features down to
        # hidden_dim=256 channels, using 256 1x1x2048 convolution kernels
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        # create a default PyTorch transformer (from torch's nn module) with the hyperparameters set above
        self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)  # class head: one score per class, plus one extra 'no object' class
        self.linear_bbox = nn.Linear(hidden_dim, 4)  # box head: a 1x4 vector [cx, cy, w, h] in normalized coordinates
        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))  # 100 object queries -> at most 100 detections per image
        # spatial positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        # note that in baseline DETR we use sine positional encodings
    def forward(self, inputs):
        # propagate inputs through ResNet-50 up to the avg-pool layer
        x = self.backbone.conv1(inputs)  # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)  # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)
        # construct positional encodings
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)
        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)
        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}
detr = DETRdemo(num_classes=91)  # COCO: 80 object categories (person, car, elephant, ...) and 91 stuff categories (grass, wall, sky, ...); the detection label space here spans 91 ids
state_dict = torch.hub.load_state_dict_from_url(  # download (first run) and load the pretrained weights
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)
# eval() disables BatchNorm updates and Dropout: BN uses its trained running statistics
# instead of batch statistics, so a small test batch cannot distort the results.
detr.eval()
# train() would re-enable BatchNorm updates and Dropout; always call model.eval()
# before running forward inference with a pretrained .pth model.
# COCO classes: 80 real category names plus 11 'N/A' placeholders
CLASSES = [
'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
'toothbrush'
]
# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
[0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]
# standard PyTorch mean-std input image normalization
transform = T.Compose([
    T.Resize(800),  # resize the shorter side to 800 pixels (aspect ratio preserved; not 800x800)
    T.ToTensor(),  # convert the PIL image to a tensor
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # normalize with the mean and std computed on ImageNet
])
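# e.g. (illustrative): a 640x480 PIL image becomes a [3, 800, 1066] tensor here,
# since Resize(800) maps the shorter side (480) to 800 and scales the width to 1066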
# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)  # unbind removes dim 1 and returns a tuple of the slices along it
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)
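# worked example (illustrative): a normalized box (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.4)
# maps to corners (x1, y1, x2, y2) = (0.4, 0.3, 0.6, 0.7):
# box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.2, 0.4]]))  # -> tensor([[0.4, 0.3, 0.6, 0.7]])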
def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b
def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    # the demo model only supports, by default, images with an aspect ratio between 0.5 and 2;
    # if you want to use images with an aspect ratio outside this range,
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'
    # propagate through the model
    outputs = model(img)
    # keep only predictions with 0.7+ confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7
    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled
# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# im = Image.open(requests.get(url, stream=True).raw)
test_img = "./test.jpg"
im = Image.open(test_img)
scores, boxes = detect(im, detr, transform)
# scores has shape [num_kept_boxes, 91]; boxes has shape [num_kept_boxes, 4]
def plot_results(pil_img, prob, boxes):
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    ax = plt.gca()
    for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes.tolist(), COLORS * 100):
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        cl = p.argmax()
        text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
        ax.text(xmin, ymin, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('on')
    plt.savefig("wsg.jpg")
    plt.show()
plot_results(im, scores, boxes)
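Besides the plot, a small sketch that prints each kept detection to the console (uses the scores and boxes returned above):
for p, box in zip(scores, boxes.tolist()):
    cl = p.argmax()
    print(f'{CLASSES[cl]}: {p[cl]:0.2f} at {[round(v, 1) for v in box]}')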
- test.jpg
- wsg.jpg (note: one object is wrongly detected as cell phone)