源码来源:https://github.com/eriklindernoren/PyTorch-YOLOv3
config中存放配置文件
utils存放需要调用的函数
本文主要解读models.py和train.py文件以及其相关的函数。
在解读源码之前先略微介绍一下yolo3的相关概念。
yolo3包含卷积层(convolutional),跳过连接(short cut),yolo层,上采样层(upsample)和route层。
yolo通过带步幅的卷积对图像进行下采样,当输入图像为416×416时,经过总步幅(下采样倍数)为32的卷积后可以得到13×13的特征图,特征图上的每个像素点对应3个anchor(即将原图像分为13×13个网格,每个网格对应特征图中的一个像素点),每个anchor有5+C个属性(边界框的中心坐标x、y,宽w,高h,边界框分数,以及属于C个类别的置信度)
在yolo3中在13×13,26×26以及52×52三个尺度上进行预测,框的中心坐标不是采用确定的坐标,而是根据图像单元格左上角的坐标偏移确定中心坐标,利用非极大值抑制解决对同一目标进行多次检测的问题。
下面先介绍一下models.py文件。
class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, config_path, img_size=416):
        """Build the network from a layer configuration file.

        Args:
            config_path: Path of the model configuration file.
            img_size: Stored on the instance as ``img_size``.
        """
        super(Darknet, self).__init__()
        # parse_model_config stores the blocks of the config file as a list of
        # dicts (attribute name -> value), one dict per block.
        self.module_defs = parse_model_config(config_path)
        # create_modules pops the first block off module_defs and returns it as
        # the hyperparameters, so module_defs and module_list line up afterwards.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0])
        self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"]

    def forward(self, x, targets=None):
        """Run the forward pass.

        Args:
            x: Input passed to the first module.
            targets: When not ``None`` the model runs in training mode and
                forwards ``targets`` to every yolo layer.

        Returns:
            In training mode, the sum of the first value returned by each yolo
            layer (per the original comments, the loss). Otherwise the yolo
            layer outputs concatenated along dimension 1.
        """
        is_training = targets is not None
        output = []
        self.losses = defaultdict(float)
        layer_outputs = []
        yolo_count = 0
        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def["type"]
            if layer_type in ["convolutional", "upsample", "maxpool"]:
                x = module(x)
            elif layer_type == "route":
                source_idxs = [int(idx) for idx in module_def["layers"].split(",")]
                # Concatenate the referenced feature maps along the channel axis.
                x = torch.cat([layer_outputs[idx] for idx in source_idxs], 1)
            elif layer_type == "shortcut":
                source_idx = int(module_def["from"])
                # Skip connection: previous output plus the referenced output.
                x = layer_outputs[-1] + layer_outputs[source_idx]
            elif layer_type == "yolo":
                yolo_count += 1
                if is_training:
                    # Train phase: get loss
                    x, *losses = module[0](x, targets)
                    for name, loss in zip(self.loss_names, losses):
                        self.losses[name] += loss
                else:
                    # Test phase: get detections
                    x = module(x)
                output.append(x)
            layer_outputs.append(x)
        # Average recall/precision over the yolo layers that actually ran rather
        # than a fixed 3, so configs with a different number of heads stay correct.
        divisor = max(yolo_count, 1)
        self.losses["recall"] /= divisor
        self.losses["precision"] /= divisor
        return sum(output) if is_training else torch.cat(output, 1)
配置文件
[convolutional] #卷积层
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
[shortcut] #跳过连接,与resnet类似,表示将前一层的输出与往前数第三层(from=-3)的输出相加
from=-3
activation=linear
[yolo] #检测层,共九个anchor,三个尺度各使用3个anchor;mask给出本层使用的anchor序号(此处6,7,8表示使用最后三个anchor),此外还设置阈值和类别数
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[route] #route的layers可能有一个值,也可能有两个值(如-1,61):为-4时表示输出往前数第四层的特征图;为-1和61时表示输出前一层与第61层特征图按通道拼接的结果
layers = -4
[upsample] #上采样,三个尺度,步幅分别为32,16,8,通过上采样实现
stride=2
def parse_model_config(path):
    """Parse a YOLOv3 layer configuration file into module definitions.

    Every ``[section]`` header starts a new dict whose ``"type"`` key holds the
    section name. Each following ``key=value`` line is stored in that dict with
    both sides stripped; values stay strings. ``convolutional`` sections get
    ``batch_normalize`` preset to the integer 0 so the key is always present.

    Args:
        path: Path of the configuration file.

    Returns:
        A list of dicts, one per section, in file order.

    Raises:
        ValueError: If a non-header line contains no ``=``.
        IndexError: If a ``key=value`` line appears before any section header.
    """
    # The context manager closes the file even when reading fails.
    with open(path, 'r') as cfg_file:
        raw_lines = cfg_file.read().split('\n')
    # Strip before filtering so whitespace-only lines and indented comments are
    # dropped instead of reaching the key/value split below.
    stripped = [raw.strip() for raw in raw_lines]
    lines = [line for line in stripped if line and not line.startswith('#')]
    module_defs = []
    for line in lines:
        if line.startswith('['):  # This marks the start of a new block
            block = {'type': line[1:-1].rstrip()}
            if block['type'] == 'convolutional':
                block['batch_normalize'] = 0
            module_defs.append(block)
        else:
            # Split on the first '=' only so values may themselves contain '='.
            key, value = line.split("=", 1)
            module_defs[-1][key.rstrip()] = value.strip()
    return module_defs
创建模块
def create_modules(module_defs):
    """Construct the module list of layer blocks described by ``module_defs``.

    The first entry of ``module_defs`` is popped (the list is modified in
    place) and returned as the hyperparameters; every remaining entry becomes
    one ``nn.Sequential`` in the returned ``nn.ModuleList``.

    Args:
        module_defs: List of dicts with a ``"type"`` key plus the string-valued
            options of that layer type.

    Returns:
        A ``(hyperparams, module_list)`` tuple.
    """
    hyperparams = module_defs.pop(0)
    # output_filters[0] is the input channel count; output_filters[k + 1] is the
    # output channel count of layer k.
    output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()
    for i, module_def in enumerate(module_defs):
        modules = nn.Sequential()
        layer_type = module_def["type"]
        # Layer types that do not set a channel count below (maxpool, upsample,
        # yolo) carry the previous one forward. Initialising here also keeps
        # `filters` bound when such a layer comes first.
        filters = output_filters[-1]

        if layer_type == "convolutional":
            bn = int(module_def["batch_normalize"])
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0
            modules.add_module(
                "conv_%d" % i,
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,
                    # No conv bias when a batch-norm layer follows.
                    bias=not bn,
                ),
            )
            if bn:
                modules.add_module("batch_norm_%d" % i, nn.BatchNorm2d(filters))
            if module_def["activation"] == "leaky":
                modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1))

        elif layer_type == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                # Pad right and bottom by one before the 2x2 stride-1 pooling.
                modules.add_module("_debug_padding_%d" % i, nn.ZeroPad2d((0, 1, 0, 1)))
            maxpool = nn.MaxPool2d(
                kernel_size=kernel_size,
                stride=stride,
                padding=int((kernel_size - 1) // 2),
            )
            modules.add_module("maxpool_%d" % i, maxpool)

        elif layer_type == "upsample":
            upsample = nn.Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
            modules.add_module("upsample_%d" % i, upsample)

        elif layer_type == "route":
            layers = [int(idx) for idx in module_def["layers"].split(",")]
            # Index per-layer outputs only (skip the input-channel entry) so a
            # non-negative index k means layer k, the same way Darknet.forward
            # indexes layer_outputs. Negative indices are unaffected.
            per_layer_filters = output_filters[1:]
            filters = sum(per_layer_filters[layer_i] for layer_i in layers)
            # Placeholder only; the concatenation itself happens in Darknet.forward.
            modules.add_module("route_%d" % i, EmptyLayer())

        elif layer_type == "shortcut":
            # Same layer-index convention as for route.
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module("shortcut_%d" % i, EmptyLayer())

        elif layer_type == "yolo":
            anchor_idxs = [int(idx) for idx in module_def["mask"].split(",")]
            # Extract anchors: flat "w,h,w,h,..." list -> (w, h) pairs, then keep
            # only the pairs selected by the mask.
            flat = [int(val) for val in module_def["anchors"].split(",")]
            anchor_pairs = [(flat[j], flat[j + 1]) for j in range(0, len(flat), 2)]
            anchors = [anchor_pairs[j] for j in anchor_idxs]
            num_classes = int(module_def["classes"])
            img_height = int(hyperparams["height"])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_height)
            modules.add_module("yolo_%d" % i, yolo_layer)

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)
    return hyperparams, module_list