I. Introduction
This paper designs a representation for skeleton sequences in action recognition by extending graph neural networks to a spatial-temporal graph model: the Spatial Temporal Graph Convolutional Network (ST-GCN). As shown in the figure, a spatial-temporal skeleton graph is constructed in which each node corresponds to a joint of the human body. There are two types of edges: spatial edges that follow the natural connectivity of the joints, and temporal edges that connect the same joint across consecutive time steps.
The paper's contributions are ST-GCN itself, the first graph-based neural network applied to skeleton-based action recognition, together with a set of design principles for the convolution kernels in ST-GCN derived from the demands of the task.
Below is the core formula of the paper, the partition-based spatial graph convolution:

$$f_{out}(v_{ti}) = \sum_{v_{tj} \in B(v_{ti})} \frac{1}{Z_{ti}(v_{tj})} f_{in}(v_{tj}) \cdot \mathbf{w}(l_{ti}(v_{tj}))$$

Here $B(v_{ti})$ is the neighbor set of joint $v_{ti}$, the label map $l_{ti}$ assigns each neighbor to one of $K$ subsets, $\mathbf{w}$ selects the weight vector of that subset, and the normalizing term $Z_{ti}(v_{tj})$ (the cardinality of the subset) balances the contributions of the different subsets. In matrix form, with one adjacency matrix $\mathbf{A}_j$ per subset, this becomes

$$f_{out} = \sum_{j} \mathbf{\Lambda}_j^{-\frac{1}{2}} \mathbf{A}_j \mathbf{\Lambda}_j^{-\frac{1}{2}} f_{in} \mathbf{W}_j$$
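To make the summation concrete, here is a minimal numerical sketch of the matrix form; all names and sizes (K partitions, V joints, C_in/C_out channels, a single frame, pre-normalized adjacencies) are illustrative assumptions, not the paper's code:

import numpy as np

# One (already normalized) adjacency per partition subset,
# one weight matrix per subset; a single frame for clarity.
K, V, C_in, C_out = 3, 18, 3, 64
A_stack = np.random.rand(K, V, V)   # stand-in for the normalized A_j
f_in = np.random.rand(V, C_in)      # input features, one row per joint
W = np.random.rand(K, C_in, C_out)  # stand-in for the W_j

# f_out = sum_j A_j @ f_in @ W_j
f_out = sum(A_stack[j] @ f_in @ W[j] for j in range(K))
print(f_out.shape)  # (18, 64)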
II. Points to Note
The paper and the code implementation differ in three points:
1. For the third spatial partitioning strategy, the paper first computes the gravity center of all skeleton joints and then compares each neighboring node's distance to the gravity center with the root node's distance to it, splitting the neighborhood into three subsets. The code instead fixes a center node in advance and compares the hop counts from the neighboring node and from the root node to that center node to form the three subsets.
2. The normalization differs: the paper multiplies the adjacency matrix A by the degree matrix on both sides ($\Lambda^{-\frac{1}{2}} A \Lambda^{-\frac{1}{2}}$), while the code only right-multiplies by the inverse degree matrix ($A \Lambda^{-1}$; see normalize_digraph below).
3. The TCN module is realized differently along the temporal dimension: the code implements it as a 2D convolution over the time axis (a sketch of this version follows this list), while the paper extends the neighborhood and the label map to the temporal domain with the following formulas:

$$B(v_{ti}) = \{ v_{qj} \mid d(v_{tj}, v_{ti}) \le K,\ |q - t| \le \lfloor \Gamma/2 \rfloor \}$$

$$l_{ST}(v_{qj}) = l_{ti}(v_{tj}) + (q - t + \lfloor \Gamma/2 \rfloor) \times K$$

where $\Gamma$ is the temporal kernel size and $K$ is the number of spatial subsets.
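For the code side of point 3, the temporal convolution is nothing more than a 2D convolution whose kernel spans $\Gamma$ frames and a single joint. A minimal sketch ($\Gamma$ = 9 matches temporal_kernel_size in st-gcn.py below; the other sizes are assumptions for illustration):

import torch
import torch.nn as nn

# The kernel (Gamma, 1) slides over the time axis independently per joint.
C, T, V, Gamma = 64, 150, 18, 9
tcn = nn.Conv2d(C, C, kernel_size=(Gamma, 1), padding=((Gamma - 1) // 2, 0))
x = torch.randn(1, C, T, V)  # (N, C, T, V) graph sequence
print(tcn(x).shape)  # torch.Size([1, 64, 150, 18]); T and V are preserved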
III. Code Implementation
1.graph.py
import numpy as np
class Graph():
""" The Graph to model the skeletons extracted by the openpose
Args:
        strategy (string): must be one of the following candidates
- uniform: Uniform Labeling
- distance: Distance Partitioning
- spatial: Spatial Configuration
For more information, please refer to the section 'Partition Strategies'
in our paper (https://arxiv.org/abs/1801.07455).
        layout (string): must be one of the following candidates
        - openpose: Consists of 18 joints. For more information, please
            refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose#output
        - ntu-rgb+d: Consists of 25 joints. For more information, please
            refer to https://github.com/shahroudy/NTURGB-D
max_hop (int): the maximal distance between two connected nodes
dilation (int): controls the spacing between the kernel points
"""
def __init__(self,
layout='openpose',
strategy='uniform',
max_hop=1,
dilation=1):
self.max_hop = max_hop
self.dilation = dilation
        # Build the edge list for the chosen skeleton layout
        self.get_edge(layout)
        # Hop-distance matrix: entries in {0, 1, ..., max_hop}, or inf if two
        # nodes are not reachable within max_hop steps
        self.hop_dis = get_hop_distance(
            self.num_node, self.edge, max_hop=max_hop)
        # Build the adjacency tensor according to the partition strategy
        self.get_adjacency(strategy)
    def __str__(self):
        # self.A is an ndarray; cast it so __str__ actually returns a str
        return str(self.A)
def get_edge(self, layout):
if layout == 'openpose':
self.num_node = 18
self_link = [(i, i) for i in range(self.num_node)]
            neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12),
                             (12, 11), (10, 9), (9, 8), (11, 5), (8, 2),
                             (5, 1), (2, 1), (0, 1), (15, 0), (14, 0),
                             (17, 15), (16, 14)]
self.edge = self_link + neighbor_link
self.center = 1
elif layout == 'ntu-rgb+d':
self.num_node = 25
self_link = [(i, i) for i in range(self.num_node)]
neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21),
(6, 5), (7, 6), (8, 7), (9, 21), (10, 9),
(11, 10), (12, 11), (13, 1), (14, 13), (15, 14),
(16, 15), (17, 1), (18, 17), (19, 18), (20, 19),
(22, 23), (23, 8), (24, 25), (25, 12)]
neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
self.edge = self_link + neighbor_link
self.center = 21 - 1
elif layout == 'ntu_edge':
self.num_node = 24
self_link = [(i, i) for i in range(self.num_node)]
neighbor_1base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6),
(8, 7), (9, 2), (10, 9), (11, 10), (12, 11),
(13, 1), (14, 13), (15, 14), (16, 15), (17, 1),
(18, 17), (19, 18), (20, 19), (21, 22), (22, 8),
(23, 24), (24, 12)]
neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
self.edge = self_link + neighbor_link
self.center = 2
# elif layout=='customer settings'
# pass
        else:
            raise ValueError("This layout does not exist.")
    # Build the adjacency tensor for the chosen partition strategy
def get_adjacency(self, strategy):
valid_hop = range(0, self.max_hop + 1, self.dilation)
adjacency = np.zeros((self.num_node, self.num_node))
for hop in valid_hop:
            # Binary adjacency: 1 wherever the hop distance equals a valid hop
            # (hop 0 marks the self-connections); unlike hop_dis, unreachable
            # pairs stay 0 here instead of inf
            adjacency[self.hop_dis == hop] = 1
        # Normalize the adjacency matrix with the degree matrix
        normalize_adjacency = normalize_digraph(adjacency)
        if strategy == 'uniform':  # the Uni-labeling partition strategy
A = np.zeros((1, self.num_node, self.num_node))
A[0] = normalize_adjacency
self.A = A
        elif strategy == 'distance':  # the Distance partitioning strategy
A = np.zeros((len(valid_hop), self.num_node, self.num_node))
            for i, hop in enumerate(valid_hop):
                # hop == 0: copy the entries of normalize_adjacency whose hop distance is 0
                # hop == 1: copy the entries of normalize_adjacency whose hop distance is 1
                A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis == hop]
self.A = A
        elif strategy == 'spatial':
            # Collect the subset matrices in a list
            A = []
            # Iterate over the valid hops (0 and 1 in hop_dis), i.e. the node
            # pairs that actually share an edge, including the root node itself
            for hop in valid_hop:
                a_root = np.zeros((self.num_node, self.num_node))
                # the neighboring nodes that are closer to the gravity center
                a_close = np.zeros((self.num_node, self.num_node))
                # otherwise the centrifugal group
                a_further = np.zeros((self.num_node, self.num_node))
                for i in range(self.num_node):
                    for j in range(self.num_node):
                        if self.hop_dis[j, i] == hop:
                            if self.hop_dis[j, self.center] == self.hop_dis[i, self.center]:
                                a_root[j, i] = normalize_adjacency[j, i]
                            elif self.hop_dis[j, self.center] > self.hop_dis[i, self.center]:
                                a_close[j, i] = normalize_adjacency[j, i]
                            else:
                                a_further[j, i] = normalize_adjacency[j, i]
if hop == 0:
A.append(a_root)
else:
A.append(a_root + a_close)
A.append(a_further)
            # Finally stacked into a (3, num_node, num_node) tensor used as the
            # weighted adjacency input to the model:
            # A[0]: the root subset (hop-0 self-connections)
            # A[1]: (a_root + a_close) adds the hop-1 neighbors that are no
            #       farther from the center than the root (centripetal group)
            # A[2]: the hop-1 neighbors farther from the center (centrifugal group)
            A = np.stack(A)
            self.A = A
        else:
            raise ValueError("This strategy does not exist.")
def get_hop_distance(num_node, edge, max_hop=1):
A = np.zeros((num_node, num_node))
    # Build the symmetric binary adjacency matrix A from the edge list
for i, j in edge:
A[j, i] = 1
A[i, j] = 1
# compute hop steps
hop_dis = np.zeros((num_node, num_node)) + np.inf
transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
    # transfer_mat is a Python list; stack it into one array so the elementwise
    # > comparison can be applied
    arrive_mat = (np.stack(transfer_mat) > 0)
    # Go from max_hop down to 0 so smaller distances overwrite larger ones,
    # leaving the minimum hop count in hop_dis
    for d in range(max_hop, -1, -1):
        hop_dis[arrive_mat[d]] = d
return hop_dis
def normalize_digraph(A):
    # Right normalization A @ D^-1 (columns sum to 1); this is the variant the
    # code actually uses (see note 2 above)
    Dl = np.sum(A, 0)
num_node = A.shape[0]
Dn = np.zeros((num_node, num_node))
for i in range(num_node):
if Dl[i] > 0:
Dn[i, i] = Dl[i]**(-1)
AD = np.dot(A, Dn)
return AD
def normalize_undigraph(A):
    # Symmetric normalization D^-0.5 @ A @ D^-0.5, as written in the paper;
    # defined here but not used by Graph.get_adjacency
    Dl = np.sum(A, 0)
num_node = A.shape[0]
Dn = np.zeros((num_node, num_node))
for i in range(num_node):
if Dl[i] > 0:
Dn[i, i] = Dl[i]**(-0.5)
DAD = np.dot(np.dot(Dn, A), Dn)
return DAD
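Appended below is a quick usage sketch (not part of the original file; it assumes graph.py is run as a script, and the shapes shown assume the OpenPose layout). It prints the adjacency shapes produced by the three strategies and illustrates the normalization difference from note 2 on a tiny 3-node chain:

if __name__ == '__main__':
    # Shape check: 'spatial' yields K = 3 subsets for the 18-joint layout
    for s in ['uniform', 'distance', 'spatial']:
        g = Graph(layout='openpose', strategy=s, max_hop=1)
        print(s, g.A.shape)  # (1, 18, 18), (2, 18, 18), (3, 18, 18)

    # Note 2: the code's right normalization A @ D^-1 versus the paper's
    # symmetric D^-0.5 @ A @ D^-0.5
    A = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
    print(normalize_digraph(A))    # each column sums to 1
    print(normalize_undigraph(A))  # symmetric result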
2.st-gcn.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from net.utils.tgcn import ConvTemporalGraphical
from net.utils.graph import Graph
class Model(nn.Module):
r"""Spatial temporal graph convolutional networks.
Args:
in_channels (int): Number of channels in the input data
num_class (int): Number of classes for the classification task
graph_args (dict): The arguments for building the graph
edge_importance_weighting (bool): If ``True``, adds a learnable
importance weighting to the edges of the graph
**kwargs (optional): Other parameters for graph convolution units
Shape:
- Input: :math:`(N, in_channels, T_{in}, V_{in}, M_{in})`
- Output: :math:`(N, num_class)` where
:math:`N` is a batch size,
:math:`T_{in}` is a length of input sequence,
:math:`V_{in}` is the number of graph nodes,
            :math:`M_{in}` is the number of instances in a frame.
"""
def __init__(self, in_channels, num_class, graph_args,
edge_importance_weighting, **kwargs):
super().__init__()
        # Load the graph and its adjacency tensor
        self.graph = Graph(**graph_args)
        # requires_grad=False: A is a fixed constant, not a trainable parameter
        A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False)
        # Register as a buffer so A is saved in the state dict but never updated
        self.register_buffer('A', A)
        # build networks
        # Spatial kernel size = A.size(0) = K, the number of adjacency subsets
        # produced by the partition strategy (not the batch size)
        spatial_kernel_size = A.size(0)
        temporal_kernel_size = 9
        kernel_size = (temporal_kernel_size, spatial_kernel_size)
        # Batch normalization over the flattened (C * V) channel dimension
        self.data_bn = nn.BatchNorm1d(in_channels * A.size(1))
kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}
self.st_gcn_networks = nn.ModuleList((
st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0),
st_gcn(64, 64, kernel_size, 1, **kwargs),
st_gcn(64, 64, kernel_size, 1, **kwargs),
st_gcn(64, 64, kernel_size, 1, **kwargs),
            st_gcn(64, 128, kernel_size, 2, **kwargs),  # stride 2, halves T (acts as temporal pooling)
st_gcn(128, 128, kernel_size, 1, **kwargs),
st_gcn(128, 128, kernel_size, 1, **kwargs),
            st_gcn(128, 256, kernel_size, 2, **kwargs),  # stride 2, halves T (acts as temporal pooling)
st_gcn(256, 256, kernel_size, 1, **kwargs),
st_gcn(256, 256, kernel_size, 1, **kwargs),
))
        # initialize parameters for edge importance weighting
        if edge_importance_weighting:  # initialized to ones, trainable
            self.edge_importance = nn.ParameterList([
                nn.Parameter(torch.ones(self.A.size()))
                for i in self.st_gcn_networks
            ])
        else:
            # otherwise fix every edge weight to 1, i.e. all edges are equally
            # important and nothing is learned
            self.edge_importance = [1] * len(self.st_gcn_networks)
        # fcn for prediction: a 1x1 convolution acting as the fully connected layer
        self.fcn = nn.Conv2d(256, num_class, kernel_size=1)
    # Forward pass of the whole Model
    def forward(self, x):
        # Network input: (N = batch_size, C = 3, T = frame count,
        # V = 18 joints, M = 2 persons per frame)
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
        x = x.view(N * M, C, T, V)  # e.g. (256 * 2, 3, 150, 18)
        # forward
        for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
            # scale the adjacency tensor by the learned edge importance
            x, _ = gcn(x, self.A * importance)
# global pooling
x = F.avg_pool2d(x, x.size()[2:])
x = x.view(N, M, -1, 1, 1).mean(dim=1)
# prediction
x = self.fcn(x)
x = x.view(x.size(0), -1)
return x
def extract_feature(self, x):
# data normalization
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
x = x.view(N * M, C, T, V)
        # forward
for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
x, _ = gcn(x, self.A * importance)
_, c, t, v = x.size()
feature = x.view(N, M, c, t, v).permute(0, 2, 3, 4, 1)
# prediction
x = self.fcn(x)
output = x.view(N, M, -1, t, v).permute(0, 2, 3, 4, 1)
return output, feature
class st_gcn(nn.Module):
r"""Applies a spatial temporal graph convolution over an input graph sequence.
Args:
in_channels (int): Number of channels in the input sequence data
out_channels (int): Number of channels produced by the convolution
kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel
stride (int, optional): Stride of the temporal convolution. Default: 1
dropout (int, optional): Dropout rate of the final output. Default: 0
residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True``
Shape:
- Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
- Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
- Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
where
:math:`N` is a batch size,
:math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`,
:math:`T_{in}/T_{out}` is a length of input/output sequence,
:math:`V` is the number of graph nodes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dropout=0,
residual=True):
super().__init__()
assert len(kernel_size) == 2
assert kernel_size[0] % 2 == 1
padding = ((kernel_size[0] - 1) // 2, 0)
self.gcn = ConvTemporalGraphical(in_channels, out_channels,
kernel_size[1])
self.tcn = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(
out_channels,
out_channels,
(kernel_size[0], 1),
(stride, 1),
padding,
),
nn.BatchNorm2d(out_channels),
nn.Dropout(dropout, inplace=True),
)
        # Residual branch
if not residual:
self.residual = lambda x: 0
elif (in_channels == out_channels) and (stride == 1):
self.residual = lambda x: x
else:
self.residual = nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=(stride, 1)),
nn.BatchNorm2d(out_channels),
)
self.relu = nn.ReLU(inplace=True)
def forward(self, x, A):
res = self.residual(x)
        # spatial graph convolution
        x, A = self.gcn(x, A)
        # temporal convolution (TCN)
        x = self.tcn(x) + res
        # activation
        return self.relu(x), A
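A quick sanity check of the full model (a sketch appended to the file, not part of the original; num_class = 400 is just an example value, and the input sizes follow the comment in forward above):

if __name__ == '__main__':
    model = Model(in_channels=3, num_class=400,
                  graph_args={'layout': 'openpose', 'strategy': 'spatial'},
                  edge_importance_weighting=True)
    model.eval()
    x = torch.randn(2, 3, 150, 18, 2)  # (N, C, T, V, M)
    print(model(x).shape)  # torch.Size([2, 400])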
3.tgcn.py
# The basic unit of graph convolutional networks.
import torch
import torch.nn as nn
class ConvTemporalGraphical(nn.Module):
r"""The basic module for applying a graph convolution.
Args:
in_channels (int): Number of channels in the input sequence data
out_channels (int): Number of channels produced by the convolution
kernel_size (int): Size of the graph convolving kernel
t_kernel_size (int): Size of the temporal convolving kernel
t_stride (int, optional): Stride of the temporal convolution. Default: 1
t_padding (int, optional): Temporal zero-padding added to both sides of
the input. Default: 0
t_dilation (int, optional): Spacing between temporal kernel elements.
Default: 1
bias (bool, optional): If ``True``, adds a learnable bias to the output.
Default: ``True``
Shape:
- Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
- Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
- Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
where
:math:`N` is a batch size,
            :math:`K` is the spatial kernel size, as :math:`K == kernel_size`,
:math:`T_{in}/T_{out}` is a length of input/output sequence,
:math:`V` is the number of graph nodes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
t_kernel_size=1,
t_stride=1,
t_padding=0,
t_dilation=1,
bias=True):
super().__init__()
        # This kernel_size is the spatial kernel size, equal to the number of
        # subsets K defined by the partition strategy
        self.kernel_size = kernel_size
self.conv = nn.Conv2d(
in_channels,
out_channels * kernel_size,
kernel_size=(t_kernel_size, 1),
padding=(t_padding, 0),
stride=(t_stride, 1),
dilation=(t_dilation, 1),
bias=bias)
def forward(self, x, A):
assert A.size(0) == self.kernel_size
        # x is (N, C, T, V); after self.conv it becomes (N, C * kernel_size, T, V)
x = self.conv(x)
n, kc, t, v = x.size()
x = x.view(n, self.kernel_size, kc//self.kernel_size, t, v)
        # Einstein summation: contract the partition (k) and node (v) axes
x = torch.einsum('nkctv,kvw->nctw', (x, A))
return x.contiguous(), A
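To see what the einsum contraction does, the sketch below (appended to the file; the sizes are arbitrary assumptions) checks it against an explicit loop over the K partition subsets:

if __name__ == '__main__':
    n, k, c, t, v = 2, 3, 4, 5, 6
    x = torch.randn(n, k, c, t, v)
    A = torch.randn(k, v, v)
    out = torch.einsum('nkctv,kvw->nctw', (x, A))
    # Equivalent: for each subset j, multiply the features by A[j], then sum
    ref = sum(torch.matmul(x[:, j], A[j]) for j in range(k))
    print(torch.allclose(out, ref, atol=1e-5))  # True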
Code link: https://github.com/yysijie/st-gcn