【Code notes】Applying the transformer to object detection, and using the debugger to trace the model's computation flow

End-to-End Object Detection with Transformers

    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()
        # create ResNet-50 backbone: resnet50 from torchvision.models serves as the feature
        # extractor, so the fully connected classification head is not needed and is deleted
        self.backbone = resnet50()
        del self.backbone.fc
        # create conversion layer: a 1x1 convolution (256 kernels of shape 1*1*2048) projects
        # the 2048-channel feature map down to hidden_dim=256 channels
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        # nn.Transformer (from torch import nn), built with the hyperparameters set above
        self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)      # class head: one score per class, plus an extra "no object" class
        self.linear_bbox = nn.Linear(hidden_dim, 4)                     # box head: outputs (cx, cy, w, h), normalized to [0, 1]
        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim)) # 100 object queries, assuming "no image contains more than 100 objects to detect"
        # spatial positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        # note that in baseline DETR we use sine positional encodings; this demo learns them instead
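  • Since the comment above contrasts learned embeddings with the sine encodings of baseline DETR, here is a minimal sketch of a 2D sine positional encoding (a hypothetical helper for illustration only; baseline DETR interleaves sin/cos per frequency slightly differently):

        import torch

        def sine_position_encoding(h, w, dim=256, temperature=10000.0):
            # half of the channels encode the row (y), half the column (x)
            half = dim // 2
            y = torch.arange(h, dtype=torch.float32).unsqueeze(1).expand(h, w)
            x = torch.arange(w, dtype=torch.float32).unsqueeze(0).expand(h, w)
            freqs = temperature ** (2 * torch.arange(half // 2, dtype=torch.float32) / half)
            pos_x = x.unsqueeze(-1) / freqs                        # (h, w, half // 2)
            pos_y = y.unsqueeze(-1) / freqs
            pos_x = torch.cat([pos_x.sin(), pos_x.cos()], dim=-1)  # (h, w, half)
            pos_y = torch.cat([pos_y.sin(), pos_y.cos()], dim=-1)
            return torch.cat([pos_y, pos_x], dim=-1)               # (h, w, dim)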
  • The DETR forward pass function
def forward(self, inputs):
        # propagate inputs through ResNet-50 up to the avg-pool layer
        x = self.backbone.conv1(inputs)     # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)        # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)

        # construct positional encodings
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)

        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)

        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}
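  • To make the debugging walkthrough concrete, here is the shape trace for one example input (the 800x1066 size is assumed for illustration; H and W depend on the actual image):

        # inputs:                        (1, 3, 800, 1066)
        # after backbone.layer4:         (1, 2048, 25, 34)    # overall stride 32
        # after self.conv:               (1, 256, 25, 34)
        # h.flatten(2).permute(2, 0, 1): (850, 1, 256)        # 25 * 34 = 850 tokens
        # pos:                           (850, 1, 256)        # one encoding per token
        # transformer out, transposed:   (1, 100, 256)        # one vector per object query
        # pred_logits:                   (1, 100, num_classes + 1)
        # pred_boxes:                    (1, 100, 4)          # (cx, cy, w, h) in [0, 1]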

  • For a walkthrough of the DETR forward pass, see https://www.bilibili.com/video/BV1GB4y1X72R (starting around 30:00).

  • This program's DETR model uses ResNet-50 as the feature extractor. Note that ResNet-50 is actually built from Bottleneck blocks (BasicBlock is used in ResNet-18/34); for reference, the forward pass of torchvision's BasicBlock is:

  •     def forward(self, x: Tensor) -> Tensor:
            identity = x
    
            out = self.conv1(x)
            out = self.bn1(out)
            out = self.relu(out)
    
            out = self.conv2(out)
            out = self.bn2(out)
    
            if self.downsample is not None:
                identity = self.downsample(x)
    
            out += identity
            out = self.relu(out)
    
            return out
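  • The Bottleneck block that ResNet-50 actually stacks adds a third convolution; this follows torchvision's implementation:

        def forward(self, x: Tensor) -> Tensor:
            identity = x

            out = self.conv1(x)      # 1x1 conv, reduce channels
            out = self.bn1(out)
            out = self.relu(out)

            out = self.conv2(out)    # 3x3 conv
            out = self.bn2(out)
            out = self.relu(out)

            out = self.conv3(out)    # 1x1 conv, expand channels (expansion = 4)
            out = self.bn3(out)

            if self.downsample is not None:
                identity = self.downsample(x)

            out += identity
            out = self.relu(out)

            return out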
    
  • Loading the pretrained weight file

  • state_dict = torch.hub.load_state_dict_from_url( # downloads the weight file on the first run; later runs load it from the local cache unless it is deleted
        url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
        map_location='cpu', check_hash=True)
    detr.load_state_dict(state_dict)
    

    • Reading the tensor shapes in the weight file is a good way to further understand the model's operations and what attributes each layer contains.
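    • For example, a quick sketch that lists every parameter and its shape (the shapes shown match the layer definitions above):

          for name, tensor in state_dict.items():
              print(name, tuple(tensor.shape))
          # e.g. 'conv.weight' -> (256, 2048, 1, 1), 'query_pos' -> (100, 256),
          #      'linear_class.weight' -> (92, 256), 'linear_bbox.weight' -> (4, 256)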

  • The detection function

    • def detect(im, model, transform):
          # mean-std normalize the input image (batch-size: 1)
          img = transform(im).unsqueeze(0)
          # the demo model only supports images with aspect ratio between 0.5 and 2 by default;
          # for images outside this range, rescale so that the maximum size is at most 1333
          # pixels for best results
          assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'
          # propagate the preprocessed image through the model
          outputs = model(img)
          # keep only predictions with 0.7+ confidence; lowering the threshold may surface more boxes
          probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
          keep = probas.max(-1).values > 0.7
          # convert the boxes from [0, 1] back to image scale
          bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
          return probas[keep], bboxes_scaled  # returns the confidence tensor and the detection boxes in original-image coordinates
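    • A small sketch of what the filtering step does, on a dummy output (the 92 columns per query assume num_classes=91 as in the full code below):

          logits = torch.randn(1, 100, 92)            # stand-in for outputs['pred_logits']
          probas = logits.softmax(-1)[0, :, :-1]      # (100, 91): drop the trailing "no object" column
          keep = probas.max(-1).values > 0.7          # boolean mask over the 100 queries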
      

  • Full code (Colab); you can modify the image-loading code (lines 162-165 in the Colab) to run inference on your own image.

from PIL import Image
# the script downloads a few inference images and the model weights over the network; requests handles the downloads
import requests
import matplotlib.pyplot as plt

# Use %config InlineBackend.figure_format = 'retina' after %matplotlib inline to render higher-resolution figures.
# %config InlineBackend.figure_format = 'retina'

import torch
from torch import nn
# resnet50 is the backbone feature extractor
from torchvision.models import resnet50
import torchvision.transforms as T
# inference only: no gradient tracking is needed, so disabling it saves memory and leaves the trained weights untouched
torch.set_grad_enabled(False)


class DETRdemo(nn.Module):
    """
    Demo DETR implementation.

    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
    Only batch size 1 supported.
    """
    # DETR constructor: 6 encoder and 6 decoder layers by default, 8 attention heads; COCO uses num_classes=91
    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()

        # create ResNet-50 backbone: resnet50 from torchvision.models serves as the feature
        # extractor, so the fully connected classification head is deleted
        self.backbone = resnet50()
        del self.backbone.fc

        # create conversion layer: a 1x1 convolution (256 kernels of shape 1*1*2048) projects
        # the 2048-channel feature map down to hidden_dim=256 channels
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer with the hyperparameters set above
        self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR the linear_bbox layer is a 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)      # class head: one score per class, plus an extra "no object" class
        self.linear_bbox = nn.Linear(hidden_dim, 4)                     # box head: outputs (cx, cy, w, h), normalized to [0, 1]

        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))      # 100 object queries, i.e. at most 100 detections per image

        # spatial positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        # note that in baseline DETR we use sine positional encodings


    def forward(self, inputs):
        # propagate inputs through ResNet-50 up to avg-pool layer
        x = self.backbone.conv1(inputs)     # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)        # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)

        # construct positional encodings
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)

        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)

        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}


detr = DETRdemo(num_classes=91) # COCO: 80 object categories (person, car, elephant, ...), but the label ids run from 1 to 90 with gaps, so 91 slots are needed (the gaps appear as 'N/A' below)
state_dict = torch.hub.load_state_dict_from_url( # load the pretrained weight file; downloaded on the first run, cached locally afterwards
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)
# eval() disables BatchNorm updates and Dropout: PyTorch freezes the BN layers so they
# use the statistics learned during training rather than batch statistics; otherwise a
# small test batch size can easily skew results through the BN layers.
detr.eval()
# train() would re-enable BatchNorm updates and Dropout.
# Always call model.eval() before forward inference with a trained .pth model.
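# sketch: eval() just flips the `training` flag on every submodule, e.g.
#   detr.train(); detr.backbone.bn1.training  -> True
#   detr.eval();  detr.backbone.bn1.training  -> False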


# COCO classes: 80 real class names plus 11 'N/A' placeholders
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]
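# the list index matches the COCO category id, e.g. CLASSES[17] == 'cat';
# the 'N/A' entries fill ids that COCO never assigned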

# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

# standard PyTorch mean-std input image normalization
transform = T.Compose([
    T.Resize(800),      # resize so the shorter side is 800 pixels (aspect ratio preserved; not 800*800)
    T.ToTensor(),       # convert the PIL image to a tensor
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])   # normalize with the ImageNet mean and std
])

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)     # unbind removes dim 1 and returns a tuple of the slices along it
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)
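# e.g. a box (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.4) becomes
# (x1, y1, x2, y2) = (0.4, 0.3, 0.6, 0.7) -- values still normalized (sketch only)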

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b

def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)

    # the demo model only supports images with aspect ratio between 0.5 and 2 by default;
    # if you want to use images with an aspect ratio outside this range,
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'

    # propagate through the model
    outputs = model(img)

    # keep only predictions with 0.7+ confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled

# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# im = Image.open(requests.get(url, stream=True).raw)
test_img = "./test.jpg"
im = Image.open(test_img)

scores, boxes = detect(im, detr, transform)
#  scores has shape [num_boxes_kept, 91]; boxes has shape [num_boxes_kept, 4]

def plot_results(pil_img, prob, boxes):
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    ax = plt.gca()
    for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes.tolist(), COLORS * 100):
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        cl = p.argmax()
        text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
        ax.text(xmin, ymin, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('on')
    plt.savefig("wsg.jpg")
    plt.show()

plot_results(im, scores, boxes)
  • test.jpg (the input image)
  • wsg.jpg (the detection output; the cell phone detection is wrong)