### 2. 网络结构

SHN网络名字起得很不错，级联的沙漏网络，顾名思义，沙漏网络就表示该网络具有高度对称性，多个沙漏网络进行级联，其实不级联也是可以检测的，只是检测效果会差一些，作者认为人体关节点之间有较强的相关性，前面沙漏检测出的关键点对后面的检测有帮助，所以前面的输出可以作为后面的输入的一部分，见下图的虚线部分，这个后面再讨论。

### 2.2 看一下 PyTorch 版本实现

class HourglassNet(nn.Module):
'''Hourglass model from Newell et al ECCV 2016'''
def __init__(self, block, num_stacks=2, num_blocks=4, num_classes=16):
"""
参数解释
:param block: hg块元素
:param num_stacks: 有几个hg
:param num_blocks: 在两个hg之间有几个block块
:param num_classes: keypoint个数,也就是最后的heatmap个数
"""
super(HourglassNet, self).__init__()

self.inplanes = 64
self.num_feats = 128
self.num_stacks = num_stacks
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=True)   # 第一次下采样
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_residual(block, self.inplanes, 1)  #self.planes = 64，有downsample（只是改变channel数）
self.layer2 = self._make_residual(block, self.inplanes, 1)  #有downsample（只是改变channel数）
# 这一次的bottleneck没有downsample，因为self.planes == planes(self.num_feats=128)*2 = 256
self.layer3 = self._make_residual(block, self.num_feats, 1)
self.maxpool = nn.MaxPool2d(2, stride=2)   #第二次下采样
# build hourglass modules
ch = self.num_feats*block.expansion   #128*2=256
hg, res, fc, score, fc_, score_ = [], [], [], [], [], []
for i in range(num_stacks):
hg.append(Hourglass(block, num_blocks, self.num_feats, 4))  #block, num_blocks, planes, depth=4
res.append(self._make_residual(block, self.num_feats, num_blocks))
fc.append(self._make_fc(ch, ch))
score.append(nn.Conv2d(ch, num_classes, kernel_size=1, bias=True))
if i < num_stacks-1:
fc_.append(nn.Conv2d(ch, ch, kernel_size=1, bias=True))
score_.append(nn.Conv2d(num_classes, ch, kernel_size=1, bias=True))
self.hg = nn.ModuleList(hg)
self.res = nn.ModuleList(res)
self.fc = nn.ModuleList(fc)
self.score = nn.ModuleList(score)
self.fc_ = nn.ModuleList(fc_)
self.score_ = nn.ModuleList(score_)

def _make_residual(self, block, planes, blocks, stride=1):  #planes = 64,blocks=4
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
# 这里的downsample只有改变通道数的功能，并没有下采样的功能，因为调用时stride固定为1
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=True),
)

layers = []
# 只在每个block的第一个bottleneck做downsample，因为channel数不相同
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion  #self.planes是改变的,从最开始的64，128,256
for i in range(1, blocks):   #因为blocks=1 ，后面都不会执行
layers.append(block(self.inplanes, planes))

return nn.Sequential(*layers)

def _make_fc(self, inplanes, outplanes):
bn = nn.BatchNorm2d(inplanes)
conv = nn.Conv2d(inplanes, outplanes, kernel_size=1, bias=True)
return nn.Sequential(
conv,
bn,
self.relu,
)

def forward(self, x):
out = []
x = self.conv1(x)  #下采样
x = self.bn1(x)
x = self.relu(x)

x = self.layer1(x)
x = self.maxpool(x)  #下采样
x = self.layer2(x)
x = self.layer3(x)

for i in range(self.num_stacks):
y = self.hg[i](x)
y = self.res[i](y)
y = self.fc[i](y)
score = self.score[i](y)
out.append(score)
if i < self.num_stacks-1:
fc_ = self.fc_[i](y)
score_ = self.score_[i](score)
x = x + fc_ + score_

return out

### 2.3 完整网络结构

heat_map继续经过1x1卷积，将depth调整到与上部分支一致，如256，最后与上部分支合并，一起作为下一个沙漏网络的输入。

### 2.4 完整网络结构PyTorch实现

class HourglassNet(nn.Module):
'''Hourglass model from Newell et al ECCV 2016'''
def __init__(self, block, num_stacks=2, num_blocks=4, num_classes=16):
"""
参数解释
:param block: hg块元素
:param num_stacks: 有几个hg
:param num_blocks: 在两个hg之间有几个block块
:param num_classes: keypoint个数,也就是最后的heatmap个数
"""
super(HourglassNet, self).__init__()

self.inplanes = 64
self.num_feats = 128
self.num_stacks = num_stacks
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=True)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_residual(block, self.inplanes, 1)  #self.planes = 64
self.layer2 = self._make_residual(block, self.inplanes, 1)
self.layer3 = self._make_residual(block, self.num_feats, 1)  #这一次的bottleneck没有downsample，因为self.planes == planes(self.num_feats=128)*2 = 256
self.maxpool = nn.MaxPool2d(2, stride=2)   #TODO 这个maxpool需不需要。论文里是有2次下采样，从256降到64，

# build hourglass modules
ch = self.num_feats*block.expansion   #128*2=256
hg, res, fc, score, fc_, score_ = [], [], [], [], [], []
for i in range(num_stacks):
hg.append(Hourglass(block, num_blocks, self.num_feats, 4))  #block, num_blocks, planes, depth=4
res.append(self._make_residual(block, self.num_feats, num_blocks))
fc.append(self._make_fc(ch, ch))
score.append(nn.Conv2d(ch, num_classes, kernel_size=1, bias=True))
if i < num_stacks-1:
fc_.append(nn.Conv2d(ch, ch, kernel_size=1, bias=True))
score_.append(nn.Conv2d(num_classes, ch, kernel_size=1, bias=True))
self.hg = nn.ModuleList(hg)
self.res = nn.ModuleList(res)
self.fc = nn.ModuleList(fc)
self.score = nn.ModuleList(score)
self.fc_ = nn.ModuleList(fc_)
self.score_ = nn.ModuleList(score_)

def _make_residual(self, block, planes, blocks, stride=1):  #planes = 64,blocks=4
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=True),
)

layers = []
layers.append(block(self.inplanes, planes, stride, downsample))  #只在每个block的第一个bottleneck做下采样，因为channel数不相同
self.inplanes = planes * block.expansion  #self.planes是改变的,从最开始的64，128,256
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))

return nn.Sequential(*layers)

def _make_fc(self, inplanes, outplanes):
bn = nn.BatchNorm2d(inplanes)
conv = nn.Conv2d(inplanes, outplanes, kernel_size=1, bias=True)
return nn.Sequential(
conv,
bn,
self.relu,
)

def forward(self, x):
out = []
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)

x = self.layer1(x)
x = self.maxpool(x)
x = self.layer2(x)
x = self.layer3(x)

for i in range(self.num_stacks):
y = self.hg[i](x)
y = self.res[i](y)
y = self.fc[i](y)
score = self.score[i](y)
out.append(score)
if i < self.num_stacks-1:
fc_ = self.fc_[i](y)
score_ = self.score_[i](score)
x = x + fc_ + score_

return out

fc和score分别表示hourglass输出的两个支路，score用于得到heatmaps，其卷积的 channel 数和 keypoints 个数相同。fc_和score_分别表示当后面还需要级联Hourglass时，需要做一些 1×1 的卷积改变 featuremaps 的通道数，这样后面才能做按元素相加，然后作为后面的输入。

### 5. 结论与结果

1. 设计了一个新的单人姿态估计网络Hourglass，效果也是棒棒的，如果用于多人需要单独的行人检测作为前端预处理。
2. 中继监督（intermediate supervision）的作用很大。
3. 级联的 Hourglass 效果非常好，是当时的 SOTA 方法。
4. 但对一些遮挡问题难以处理，这是绝大部分算法的难题。

[1] Newell A , Yang K , Deng J . Stacked Hourglass Networks for Human Pose Estimation[J]. 2016.

[4] https://blog.csdn.net/shenxiaolu1984/article/details/51428392