End-to-End Object Detection with Transformers
-
An end-to-end Transformer-based object detection network proposed by Facebook, published at ECCV 2020; the code is open source: facebookresearch/detr: End-to-End Object Detection with Transformers (github.com). What is kept here is a simplified model that loads pretrained DETR weights and runs inference on images.
-
DETR model constructor
def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
    super().__init__()
    # create ResNet-50 backbone: imported from torchvision.models as the feature
    # extractor, so its fully connected classification head is not needed
    self.backbone = resnet50()
    del self.backbone.fc
    # create conversion layer: project the 2048-channel backbone features down to
    # hidden_dim=256 channels, using 256 1x1x2048 convolution kernels
    self.conv = nn.Conv2d(2048, hidden_dim, 1)
    # the standard transformer from torch's nn module, with the hyperparameters set above
    self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)  # class head: one score per class, plus one extra 'no object' class
    self.linear_bbox = nn.Linear(hidden_dim, 4)  # box head: a 1x4 vector [cx, cy, w, h] in normalized coordinates
    # output positional encodings (object queries)
    self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))  # 100 object queries, assuming no image contains more than 100 objects to detect
    # spatial positional encodings
    self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    # note that in baseline DETR we use sine positional encodings
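A quick sanity check of these layers, as a minimal sketch (DETRdemo is the class from the full listing below):
model = DETRdemo(num_classes=91)
print(model.conv)             # Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
print(model.query_pos.shape)  # torch.Size([100, 256])
print(model.row_embed.shape)  # torch.Size([50, 128]) -- half of hidden_dim per spatial axis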
- DETR forward function
def forward(self, inputs):
    # propagate inputs through ResNet-50 up to the avg-pool layer
    x = self.backbone.conv1(inputs)  # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
    x = self.backbone.bn1(x)
    x = self.backbone.relu(x)
    x = self.backbone.maxpool(x)  # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    x = self.backbone.layer1(x)
    x = self.backbone.layer2(x)
    x = self.backbone.layer3(x)
    x = self.backbone.layer4(x)
    # convert from 2048 to 256 feature planes for the transformer
    h = self.conv(x)
    # construct positional encodings
    H, W = h.shape[-2:]
    pos = torch.cat([
        self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
        self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
    ], dim=-1).flatten(0, 1).unsqueeze(1)
    # propagate through the transformer
    h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                         self.query_pos.unsqueeze(1)).transpose(0, 1)
    # finally project transformer outputs to class labels and bounding boxes
    return {'pred_logits': self.linear_class(h),
            'pred_boxes': self.linear_bbox(h).sigmoid()}
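To make the tensor shapes concrete, here is a hedged trace for an assumed 1x3x800x1066 input (e.g. a 640x480 image after Resize(800)); the backbone downsamples by 32, so the encoder sees 25*34 = 850 tokens:
x = torch.rand(1, 3, 800, 1066)  # dummy batch; the size is an illustrative assumption
# backbone layer4 output: [1, 2048, 25, 34]; after self.conv: [1, 256, 25, 34]
# h.flatten(2).permute(2, 0, 1): [850, 1, 256] -> encoder input sequence
# query_pos.unsqueeze(1): [100, 1, 256] -> decoder input (object queries)
out = detr(x)  # assumes detr = DETRdemo(num_classes=91), as in the full listing below
print(out['pred_logits'].shape)  # torch.Size([1, 100, 92])
print(out['pred_boxes'].shape)   # torch.Size([1, 100, 4])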
-
For a walkthrough of DETR's forward pass, see https://www.bilibili.com/video/BV1GB4y1X72R starting at 30:00.
-
This program's DETR model uses ResNet-50 as the feature extractor. Forward pass of ResNet's BasicBlock (note that ResNet-50 itself stacks Bottleneck blocks; BasicBlock is used by ResNet-18/34, but the residual pattern is the same):
-
def forward(self, x: Tensor) -> Tensor:
    identity = x
    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.conv2(out)
    out = self.bn2(out)
    if self.downsample is not None:
        identity = self.downsample(x)
    out += identity
    out = self.relu(out)
    return out
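For reference, ResNet-50 builds layer1-layer4 from Bottleneck blocks; a minimal sketch of the corresponding forward pass, mirroring torchvision's implementation (same residual pattern, three convolutions):
def forward(self, x: Tensor) -> Tensor:
    identity = x
    out = self.relu(self.bn1(self.conv1(x)))    # 1x1 conv, reduce channels
    out = self.relu(self.bn2(self.conv2(out)))  # 3x3 conv (stride 2 in downsampling blocks)
    out = self.bn3(self.conv3(out))             # 1x1 conv, expand channels (x4)
    if self.downsample is not None:
        identity = self.downsample(x)           # 1x1 conv + BN to match shapes for the skip
    out += identity                             # residual connection
    return self.relu(out)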
-
Loading the pretrained weight file
-
state_dict = torch.hub.load_state_dict_from_url(  # downloads the weight file on the first run; later runs load it from the local cache unless it is deleted
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)
-
Reading the tensor shapes in the weight file is a good way to understand the model's operations and what attributes each layer contains.
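For example, a minimal sketch that prints the first few checkpoint entries (the key names follow the module names in DETRdemo):
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
# e.g. backbone.conv1.weight (64, 3, 7, 7), backbone.bn1.weight (64,), ...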
-
Detection function
-
def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    # the demo model only supports, by default, images with an aspect ratio between 0.5 and 2;
    # to use images outside this range, rescale so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'
    # propagate the preprocessed image through the model
    outputs = model(img)
    # keep only predictions with 0.7+ confidence; lowering the threshold keeps more boxes
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7
    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled  # confidence tensor and box annotations at original-image scale
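The last softmax column is the score of the extra 'no object' class, which is why it is dropped before thresholding. A minimal sketch of the filtering step, assuming an illustrative stricter threshold of 0.9 (the demo itself uses 0.7):
probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]  # drop the 'no object' column -> [100, 91]
keep = probas.max(-1).values > 0.9  # 0.9 is an illustrative value; higher keeps fewer, cleaner boxes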
-
Full code (runnable in Colab). To run inference on your own image, edit the image-loading lines near the end (test_img / Image.open).
from PIL import Image
# this program downloads data from the web (a few test images and the model file); requests makes the downloads convenient
import requests
import matplotlib.pyplot as plt
# use %config InlineBackend.figure_format = 'retina' after %matplotlib inline to render higher-resolution figures
# %config InlineBackend.figure_format = 'retina'
import torch
from torch import nn
# the backbone is resnet50, used for feature extraction
from torchvision.models import resnet50
import torchvision.transforms as T
# inference involves no gradient computation; disabling it saves memory and leaves the pretrained weights untouched
torch.set_grad_enabled(False)
class DETRdemo(nn.Module):
    """
    Demo DETR implementation.
    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
    Only batch size 1 supported.
    """
    # DETR constructor: by default 6 encoder and 6 decoder layers with 8 attention
    # heads; for the COCO dataset, num_classes is 91
    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()
        # create ResNet-50 backbone: imported from torchvision.models as the feature
        # extractor, so its fully connected classification head is not needed
        self.backbone = resnet50()
        del self.backbone.fc
        # create conversion layer: project the 2048-channel backbone features down to
        # hidden_dim=256 channels, using 256 1x1x2048 convolution kernels
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        # create a default PyTorch transformer (from torch's nn module) with the hyperparameters set above
        self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)  # class head: one score per class, plus one extra 'no object' class
        self.linear_bbox = nn.Linear(hidden_dim, 4)  # box head: a 1x4 vector [cx, cy, w, h] in normalized coordinates
        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))  # 100 object queries -> at most 100 detections per image
        # spatial positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        # note that in baseline DETR we use sine positional encodings
    def forward(self, inputs):
        # propagate inputs through ResNet-50 up to the avg-pool layer
        x = self.backbone.conv1(inputs)  # nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)  # nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)
        # construct positional encodings
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)
        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)
        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}
detr = DETRdemo(num_classes=91)  # COCO: 80 object categories (person, car, elephant, ...) and 91 stuff categories (grass, wall, sky, ...); the detection label space here spans 91 ids
state_dict = torch.hub.load_state_dict_from_url(  # download (first run) and load the pretrained weights
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)
# eval() disables BatchNorm updates and Dropout: BN uses its trained running statistics
# instead of batch statistics, so a small test batch cannot distort the results.
detr.eval()
# train() would re-enable BatchNorm updates and Dropout; always call model.eval()
# before running forward inference with a pretrained .pth model.
# COCO classes: 80 real category names plus 11 'N/A' placeholders
CLASSES = [
'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
'toothbrush'
]
# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
[0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]
# standard PyTorch mean-std input image normalization
transform = T.Compose([
    T.Resize(800),  # resize the shorter side to 800 pixels (aspect ratio preserved; not 800x800)
    T.ToTensor(),  # convert the PIL image to a tensor
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # normalize with the mean and std computed on ImageNet
])
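# e.g. (illustrative): a 640x480 PIL image becomes a [3, 800, 1066] tensor here,
# since Resize(800) maps the shorter side (480) to 800 and scales the width to 1066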
# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)  # unbind removes dim 1 and returns a tuple of the slices along it
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)
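# worked example (illustrative): a normalized box (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.4)
# maps to corners (x1, y1, x2, y2) = (0.4, 0.3, 0.6, 0.7):
# box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.2, 0.4]]))  # -> tensor([[0.4, 0.3, 0.6, 0.7]])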
def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b
def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    # the demo model only supports, by default, images with an aspect ratio between 0.5 and 2;
    # if you want to use images with an aspect ratio outside this range,
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'
    # propagate through the model
    outputs = model(img)
    # keep only predictions with 0.7+ confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7
    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled
# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# im = Image.open(requests.get(url, stream=True).raw)
test_img = "./test.jpg"
im = Image.open(test_img)
scores, boxes = detect(im, detr, transform)
# scores has shape [num_kept_boxes, 91]; boxes has shape [num_kept_boxes, 4]
def plot_results(pil_img, prob, boxes):
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    ax = plt.gca()
    for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes.tolist(), COLORS * 100):
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        cl = p.argmax()
        text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
        ax.text(xmin, ymin, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('on')
    plt.savefig("wsg.jpg")
    plt.show()
plot_results(im, scores, boxes)
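Besides the plot, a small sketch that prints each kept detection to the console (uses the scores and boxes returned above):
for p, box in zip(scores, boxes.tolist()):
    cl = p.argmax()
    print(f'{CLASSES[cl]}: {p[cl]:0.2f} at {[round(v, 1) for v in box]}')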
- test.jpg
- wsg.jpg (note: one object is wrongly detected as cell phone)