前文:
-
参考代码
本篇讲解ASTGCN的模型框架以及用到的小函数。
文章目录
import torch
import torch.nn as nn
import torch.nn.functional as F
from lib.utils import scaled_Laplacian, cheb_polynomial
壹、模型框架
一、文件内容
查看一下该 .py 文件中的 class 类,并确定 class 之间的关系。
二、算法流程
三、make_model
- 输入变量
变量(本函数) | 又名(传递) | 类型 | 举例 | 用途 |
---|---|---|---|---|
DEVICE | | str | “cpu” | 模型运行的设备 |
nb_block | | int | 2 | ST-Block的个数 |
in_channels | | int | 1 | 输入特征F的维度 |
K | | int | 3 | 切比雪夫多项式的阶 |
nb_chev_filter | | int | 64 | cheb卷积层中输出特征的维度 |
nb_time_filter | | int | 64 | 时间卷积层中输出特征的维度 |
time_strides | =num_of_hours | int | 1 | 时间卷积层中在时间轴上滑动的距离 |
cheb_polynomials | 由adj_mx计算得到 | np.array | adj_mx | 用于多项式的矩阵 |
nb_predict_step | =num_for_predict | int | 12 | 用于预测多长时间的结果,if12,so预测1小时 |
len_input | num_of_timesteps | int | 12 | 是指时间维度T的值❓ |
num_of_vertices | | int | 307 | if PEMS04,so顶点个数为307 |
def make_model(DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, adj_mx, num_for_predict, len_input, num_of_vertices):
    """Build an ``ASTGCN_submodule`` from an adjacency matrix and initialise it.

    Steps:
      1. ``scaled_Laplacian(adj_mx)`` produces the scaled Laplacian.
      2. Every array returned by ``cheb_polynomial(L_tilde, K)`` is converted
         to a float tensor and moved to ``DEVICE``.
      3. The network is constructed from those tensors and the remaining
         hyper-parameters, which are forwarded unchanged.
      4. Every parameter is re-initialised: parameters with more than one
         dimension get Xavier/Glorot uniform initialisation, all others get
         ``nn.init.uniform_`` with its default bounds.

    :return: the initialised ``ASTGCN_submodule`` instance.
    """
    laplacian = scaled_Laplacian(adj_mx)

    # ndarray -> float32 tensor on the target device, one per Chebyshev term.
    cheb_polynomials = []
    for term in cheb_polynomial(laplacian, K):
        cheb_polynomials.append(torch.from_numpy(term).type(torch.FloatTensor).to(DEVICE))

    model = ASTGCN_submodule(
        DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter,
        time_strides, cheb_polynomials, num_for_predict, len_input, num_of_vertices,
    )

    # The custom layers allocate their weights without filling them, so this
    # explicit initialisation pass is what gives them usable starting values.
    for param in model.parameters():
        initialise = nn.init.xavier_uniform_ if param.dim() > 1 else nn.init.uniform_
        initialise(param)

    return model
贰、时间和空间注意力层
一、Temporal_Attention_layer
通过该module获得时间注意力矩阵E_normalized.
$$E = V_e \cdot \sigma\left(\left((\mathcal{X}^{(r-1)}_{h})^{T} U_1\right) U_2 \left(U_3 \mathcal{X}^{(r-1)}_{h}\right) + b_e\right)$$
$$E'_{i,j}=\frac{\exp(E_{i,j})}{\sum_{j=1}^{T_{r-1}}\exp(E_{i,j})}$$
where,
$V_e, b_e \in R^{T_{r-1}\times T_{r-1}}$,$U_1 \in R^{N}$,$U_2 \in R^{C_{r-1}\times N}$,$U_3 \in R^{C_{r-1}}$ 都是可学习的参数。
$E_{i,j} \in E$ 在语义上表示时刻 $i$ 与时刻 $j$ 之间的依赖关系强度。最后,$E$ 被 softmax 函数规范化。
输入: $\mathcal{X}^{(r-1)}_h=(X_1,X_2,...,X_{T_{r-1}}) \in R^{N\times C_{r-1} \times T_{r-1}}$,上标 $(r-1)$ 表示这是第 $r$ 层的输入
输出: $E'$
作用: $\tilde{\mathcal{X}}^{(r-1)}_h=(\tilde{X}_1,\tilde{X}_2,...,\tilde{X}_{T_{r-1}}) =(X_1,X_2,...,X_{T_{r-1}})E' \in R^{N\times C_{r-1} \times T_{r-1}}$
- 实例化参数
变量 | 类型 | 举例 | 用途 |
---|---|---|---|
DEVICE | str | “cpu” | 将变量和模型放到设备上运行 |
in_channels | int | 1 | 输入特征的维度F |
num_of_vertices | int | 307 | 顶点个数 |
num_of_timesteps | int | 12 | 时间轴的数目T=12 |
- 函数流程
def __init__
1. 初始化
2. 设置可学习参数的结构,并放到DEVICE上
U1:(N, ) =(307,)
U2:(F,N) =(1,307)
U3:(F, ) =(1,)
be:(1,T,T) = (1, 12, 12)
Ve:(T,T) = (12, 12)
def forward
3. 输入变量x:shape=(B, N, F, T).与train_loader,val_loader,test_loader的形状类似
4. lhs= (x^T)*U1,再*U2.
x作转置是为了可以先与U1相乘,因此x^T=(B,不确定,不确定,N)
(x^T)*U1=(B,不确定,不确定),为了继续可以和U2=(F,N)相乘,因此(B,T,F)
所以,x的转置操作为x.permute(0, 3, 2, 1),即(B,N,F,T)->(B,T,F,N)。
(B,T,F,N)*(N,)*(F,N)=(B,T,F)*(F,N)=(B,T,N)
5. rhs=U3*x. (F,)*(B,N,F,T) =(B,N,T)
6. product=lhs*rhs (B,T,N)*(B,N,T)=(B,T,T) l:left,r:right
7. E = Ve * sigmoid(product+be) shape is (B,T,T)
8. E_normalized = softmax(E) shape is (B,T,T)
class Temporal_Attention_layer(nn.Module):
    """Temporal attention: produces a (B, T, T) attention matrix from the input.

    Learnable parameters (allocated here, filled later by the caller):
      * ``U1``: (num_of_vertices,)
      * ``U2``: (in_channels, num_of_vertices)
      * ``U3``: (in_channels,)
      * ``be``: (1, num_of_timesteps, num_of_timesteps)
      * ``Ve``: (num_of_timesteps, num_of_timesteps)
    """

    def __init__(self, DEVICE, in_channels, num_of_vertices, num_of_timesteps):
        super().__init__()
        n_nodes, n_feat, n_steps = num_of_vertices, in_channels, num_of_timesteps
        # torch.FloatTensor(*sizes) only allocates storage; the values are
        # whatever happened to be in memory until an initialiser runs.
        self.U1 = nn.Parameter(torch.FloatTensor(n_nodes).to(DEVICE))
        self.U2 = nn.Parameter(torch.FloatTensor(n_feat, n_nodes).to(DEVICE))
        self.U3 = nn.Parameter(torch.FloatTensor(n_feat).to(DEVICE))
        self.be = nn.Parameter(torch.FloatTensor(1, n_steps, n_steps).to(DEVICE))
        self.Ve = nn.Parameter(torch.FloatTensor(n_steps, n_steps).to(DEVICE))

    def forward(self, x):
        """Compute the normalised temporal attention matrix.

        :param x: tensor of shape (B, N, F_in, T)
        :return: tensor of shape (B, T, T), softmax-normalised along dim 1
        """
        # Unpacking four values also rejects inputs that are not 4-D.
        _batch, _nodes, _feats, _steps = x.shape

        # Left-hand side: (B, N, F, T) -> (B, T, F, N) -> @U1 -> (B, T, F) -> @U2 -> (B, T, N)
        time_major = x.permute(0, 3, 2, 1)
        lhs = torch.matmul(time_major, self.U1)
        lhs = torch.matmul(lhs, self.U2)

        # Right-hand side: (F,) @ (B, N, F, T) -> (B, N, T)
        rhs = torch.matmul(self.U3, x)

        # (B, T, N) @ (B, N, T) -> (B, T, T)
        scores = torch.matmul(lhs, rhs)

        # Ve (T, T) is broadcast over the batch: (T, T) @ (B, T, T) -> (B, T, T)
        E = torch.matmul(self.Ve, torch.sigmoid(scores + self.be))

        return F.softmax(E, dim=1)
二、Spatial_Attention_layer
$$S = V_s \cdot \sigma\left(\left(\mathcal{X}^{(r-1)}_{h} W_1\right) W_2 \left(W_3 \mathcal{X}^{(r-1)}_{h}\right)^{T} + b_s\right)$$
$$S'_{i,j}=\frac{\exp(S_{i,j})}{\sum_{j=1}^{N}\exp(S_{i,j})}$$
$C_{r-1}$ 是第 $r$ 层的数据输入的通道大小(=特征F)。
$\sigma$ 是激活函数
S(注意力矩阵)是根据该层的当前输入动态计算的。
$S_{i,j}\in S$ 表示节点 i 与节点 j 之间的相关强度。然后使用softmax函数确保节点的attention weights的和为1.
特别的:
当 $r=1$ 时,$C_0=F$;$T_{r-1}$ 是第 $r$ 层的时间维度的长度。
当 $r=1$ 时,$T_0=T_h$(或 $T_0=T_d$,或 $T_0=T_w$)。$V_s,b_s \in R^{N\times N}$,$W_1 \in R^{T_{r-1}}$,$W_2\in R^{C_{r-1}\times T_{r-1}}$,$W_3 \in R^{C_{r-1}}$ 都是可学习的参数。
输入: $\mathcal{X}^{(r-1)}_h=(X_1,X_2,...,X_{T_{r-1}}) \in R^{N\times C_{r-1} \times T_{r-1}}$,也是spatial-temporal block 的第r次输入
输出: $S'$
作用:
- 实例化参数
变量 | 类型 | 举例 | 用途 |
---|---|---|---|
DEVICE | str | “cpu” | |
in_channels | int | 1 | 输入中特征的维度F |
num_of_vertices | int | 307 | 顶点个数 |
num_of_timesteps | int | 12 | 时间轴的维度T |
- 函数流程
def __init__
1. 按照nn.Module的方式进行初始化
2. 设置可以学习的参数结构并放到DEVICE
W1:(T,) =(12,)
W2:(F,T) =(1, 12)
W3:(F,) =(1,)
bs: (1, N, N) =(1,307,307)
Vs:(N, N) =(307,307)
def forward
3. 输入变量X:shape=(B, N, F, T).与train_loader,val_loader,test_loader的形状类似
4. lhs = X * W1,再*W2. (B,N,F,T) *(T,)=(B, N,F);再乘时,(B,N,F)*(F,T)=(B,N,T)
5. rhs= (W3*X)^T. (F,)*(B, N,F,T) =(B,N,T)规则是(B,N)看作batch不变,(F,)*(F,T)=(T,)
^T的作用是为了方便lhs*rhs, (B,N,T)*(B,未知,未知)=(B, N,N)
所以rhs=(B,T,N)<--(B,N,T),只需要对最后两个轴交换位置即可(transpose(-1,-2))
6. product=lhs*rhs shape is (B, N, N)
7. S = Vs * sigmoid(product+bs) shape is (B, N, N)
8. S_normalized = softmax(S)
class Spatial_Attention_layer(nn.Module):
    """Spatial attention: produces a (B, N, N) attention matrix from the input.

    Learnable parameters (allocated here, filled later by the caller):
      * ``W1``: (num_of_timesteps,)
      * ``W2``: (in_channels, num_of_timesteps)
      * ``W3``: (in_channels,)
      * ``bs``: (1, num_of_vertices, num_of_vertices)
      * ``Vs``: (num_of_vertices, num_of_vertices)
    """

    def __init__(self, DEVICE, in_channels, num_of_vertices, num_of_timesteps):
        super(Spatial_Attention_layer, self).__init__()
        # torch.FloatTensor(*sizes) only allocates storage; values are
        # undefined until an initialiser is applied.
        self.W1 = nn.Parameter(torch.FloatTensor(num_of_timesteps).to(DEVICE))
        self.W2 = nn.Parameter(torch.FloatTensor(in_channels, num_of_timesteps).to(DEVICE))
        self.W3 = nn.Parameter(torch.FloatTensor(in_channels).to(DEVICE))
        self.bs = nn.Parameter(torch.FloatTensor(1, num_of_vertices, num_of_vertices).to(DEVICE))
        self.Vs = nn.Parameter(torch.FloatTensor(num_of_vertices, num_of_vertices).to(DEVICE))

    def forward(self, x):
        """Compute the normalised spatial attention matrix.

        :param x: torch.Tensor of shape (batch_size, N_nodes, C_{r-1}, T_{r-1}),
            i.e. the input of the r-th spatial-temporal block.
        :return: torch.Tensor ``S_normalized`` of shape
            (batch_size, N_nodes, N_nodes), softmax-normalised along dim 1.
        """
        # lhs = (x W1) W2:
        #   (B, N, F, T) @ (T,) -> (B, N, F);  (B, N, F) @ (F, T) -> (B, N, T)
        lhs = torch.matmul(torch.matmul(x, self.W1), self.W2)

        # rhs = (W3 x)^T:
        #   (F,) @ (B, N, F, T) -> (B, N, T);  transpose -> (B, T, N)
        rhs = torch.matmul(self.W3, x).transpose(-1, -2)

        # (B, N, T) @ (B, T, N) -> (B, N, N)
        product = torch.matmul(lhs, rhs)

        # Vs (N, N) is broadcast over the batch: (N, N) @ (B, N, N) -> (B, N, N)
        S = torch.matmul(self.Vs, torch.sigmoid(product + self.bs))

        # F.softmax replaces a hand-written "subtract max, exp, divide by sum"
        # normalisation along axis 1.
        S_normalized = F.softmax(S, dim=1)

        # The result must be returned: callers feed it straight into the
        # graph convolution.
        return S_normalized
叁、切比雪夫图卷积的制造
一、cheb_conv_withSAt
该函数是基于注意力的时空图卷积算子的切比雪夫谱图卷积算子 $g_{\theta} *_{G}$
$$g_{\theta} *_{G} x = g_{\theta}(L)x=\sum_{k=0}^{K-1}\theta_k \left(T_k(\tilde{L})\odot S'\right)x$$
$\tilde{L}=\frac{2L}{\lambda_{\max}} - I$ denotes the scaled and normalized Laplacian
where
- $S'$ :(B, N, N) 空间注意力矩阵
- $\theta_k$:(F_in, F_out)
- $T_k(\tilde{L})$: (N, N) 切比雪夫多项式系数
- $x$: (B, N ,F_in, T) 图信号矩阵,作为输入数据
- 结果: (B, N, F_out, T)
- 初始化变量
变量 | 类型 | 举例 | 用途 |
---|---|---|---|
K | int | 3 | 切比雪夫图卷积的阶 |
cheb_polynomials | fun | 见secA | 返回切比雪夫多项式的 系数列表 |
in_channels | int | 1 | 输入数据的特征这一轴的维度值F_in |
out_channels | int | 64 | 输出数据特征F_out;nb_chev_filter |
- 函数流程
def __init__
1. 按照nn.Module的方式进行初始化
2. 设置类的属性K, cheb_polynomials,in_channels,out_channels,DEVICE
3. 设置参数列表 Theta 并初始化;列表中参数共有K个,每个的shape为(F_in, F_out)
def forward
4. 输入X:(B, N, F_in, T) ; Spatial_attention:(B, N, N)
5. for time_step in range(T):
graph_signal = x[:, :, :, time_step] (B, N, F_in)
output:初始设置为(B, N, F_out)的全0矩阵,用作累计求和
for k in range(self.K):
T_k:(N, N)第k项切比雪夫多项式系数
T_k_with_at: =T_k*S' ;(B, N, N) 是逐元素乘法
rhs:=T_k_with_at*x ; (B, N, F_in) =(B, N, N)*(B, N, F_in)
theta_k:(F_in,F_out)第k项Theta中的参数
output=output+ rhs theta_k ;(B,N,F_out)= (B,N,F_in)* (F_in,F_out)
将output在最后一轴升维后(B,N,F_out,1),放入列表outputs中
6. 将outputs中的元素按照axis=-1进行合并,得到结果outputs_new为(B,N,F_out,T).这样从5-7完成了不动T轴下的图卷积工作
7. return F.relu(outputs_new)
- 输出数据:outputs_new为(B,N,F_out,T).
class cheb_conv_withSAt(nn.Module):
    """K-term Chebyshev graph convolution modulated by spatial attention."""

    def __init__(self, K, cheb_polynomials, in_channels, out_channels):
        """
        :param K: int, number of Chebyshev terms
        :param cheb_polynomials: sequence of tensors; the device of the first
            one is used for the weights and the accumulator
        :param in_channels: int, size of the feature axis of the input
        :param out_channels: int, size of the feature axis of the output
        """
        super().__init__()
        self.K = K
        self.cheb_polynomials = cheb_polynomials
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.DEVICE = cheb_polynomials[0].device

        # One (in_channels, out_channels) weight matrix per Chebyshev term;
        # allocated uninitialised, to be filled by the caller.
        weights = []
        for _ in range(K):
            weights.append(nn.Parameter(torch.FloatTensor(in_channels, out_channels).to(self.DEVICE)))
        self.Theta = nn.ParameterList(weights)

    def forward(self, x, spatial_attention):
        """Apply the attention-weighted graph convolution at every time step.

        :param x: (batch_size, N, F_in, T)
        :param spatial_attention: (batch_size, N, N)
        :return: (batch_size, N, F_out, T), after ReLU
        """
        batch_size, num_of_vertices, in_channels, num_of_timesteps = x.shape

        per_step = []
        for t in range(num_of_timesteps):
            signal = x[:, :, :, t]  # (B, N, F_in)

            # Running sum over the K terms, shape (B, N, F_out).
            acc = torch.zeros(batch_size, num_of_vertices, self.out_channels).to(self.DEVICE)
            for order in range(self.K):
                # Element-wise product: (N, N) * (B, N, N) -> (B, N, N)
                attended = self.cheb_polynomials[order].mul(spatial_attention)
                # Transposed attended matrix applied to the signal:
                # (B, N, N) @ (B, N, F_in) -> (B, N, F_in)
                propagated = attended.permute(0, 2, 1).matmul(signal)
                # (B, N, F_in) @ (F_in, F_out) -> (B, N, F_out)
                acc = acc + propagated.matmul(self.Theta[order])

            per_step.append(acc.unsqueeze(-1))  # (B, N, F_out, 1)

        # Concatenate along the new trailing axis to rebuild the time axis.
        stacked = torch.cat(per_step, dim=-1)
        return F.relu(stacked)
二、cheb_conv
该函数是没有基于注意力的时空图卷积算子的切比雪夫谱图卷积算子 $g_{\theta} *_{G}$。算法流程与前面相似。
$$g_{\theta} *_{G} x = g_{\theta}(L)x=\sum_{k=0}^{K-1}\theta_k T_k(\tilde{L})x$$
class cheb_conv(nn.Module):
    """K-term Chebyshev graph convolution (no attention)."""

    def __init__(self, K, cheb_polynomials, in_channels, out_channels):
        """
        :param K: int, number of Chebyshev terms
        :param cheb_polynomials: sequence of tensors; the device of the first
            one is used for the weights and the accumulator
        :param in_channels: int, num of channels in the input sequence
        :param out_channels: int, num of channels in the output sequence
        """
        super().__init__()
        self.K = K
        self.cheb_polynomials = cheb_polynomials
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.DEVICE = cheb_polynomials[0].device

        # One (in_channels, out_channels) weight matrix per Chebyshev term;
        # allocated uninitialised, to be filled by the caller.
        weights = []
        for _ in range(K):
            weights.append(nn.Parameter(torch.FloatTensor(in_channels, out_channels).to(self.DEVICE)))
        self.Theta = nn.ParameterList(weights)

    def forward(self, x):
        """Apply the graph convolution independently at every time step.

        :param x: (batch_size, N, F_in, T)
        :return: (batch_size, N, F_out, T), after ReLU
        """
        batch_size, num_of_vertices, in_channels, num_of_timesteps = x.shape

        per_step = []
        for t in range(num_of_timesteps):
            signal = x[:, :, :, t]  # (B, N, F_in)

            # Running sum over the K terms, shape (B, N, F_out).
            acc = torch.zeros(batch_size, num_of_vertices, self.out_channels).to(self.DEVICE)
            for order in range(self.K):
                poly = self.cheb_polynomials[order]  # (N, N)
                # (B, F_in, N) @ (N, N) -> (B, F_in, N), then back to (B, N, F_in)
                features_first = signal.permute(0, 2, 1)
                propagated = features_first.matmul(poly).permute(0, 2, 1)
                # (B, N, F_in) @ (F_in, F_out) -> (B, N, F_out)
                acc = acc + propagated.matmul(self.Theta[order])

            per_step.append(acc.unsqueeze(-1))  # (B, N, F_out, 1)

        stacked = torch.cat(per_step, dim=-1)
        return F.relu(stacked)
A. lib.utils的cheb_polynomial
计算切比雪夫多项式的系数from T_0 to T_{K-1}并返回系数列表
- 函数流程
1. 从矩阵中获得顶点个数N,并生成单位矩阵
2. 将 单位矩阵(L^0) 和 L(L^1) 放入列表 cheb_polynomials
3. 根据 切比雪夫多项式的递推公式,获得第2,...,K-1项的系数并放入 列表cheb_polynomials
4. 返回切比雪夫多项式的 系数列表
def cheb_polynomial(L_tilde, K):
    """Compute the Chebyshev polynomials T_0 .. T_{K-1} of the scaled Laplacian.

    Uses the recurrence ``T_k = 2 * L_tilde @ T_{k-1} - T_{k-2}`` with
    ``T_0 = I`` and ``T_1 = L_tilde``. The product in the recurrence is a
    matrix product: for ``np.ndarray`` operands ``*`` would multiply
    element-wise, which does not give the polynomial of the matrix.

    Parameters
    ----------
    L_tilde: scaled Laplacian, np.ndarray, shape (N, N)
    K: the maximum order of chebyshev polynomials

    Returns
    ----------
    cheb_polynomials: list(np.ndarray), from T_0 to T_{K-1}. The list always
        starts with T_0 and T_1, so it has two entries even when K < 2.
    """
    N = L_tilde.shape[0]

    # T_0 = identity, T_1 = L_tilde (copied so callers cannot alias the input).
    cheb_polynomials = [np.identity(N), L_tilde.copy()]

    for i in range(2, K):
        previous, before_previous = cheb_polynomials[i - 1], cheb_polynomials[i - 2]
        cheb_polynomials.append(2 * np.matmul(L_tilde, previous) - before_previous)

    return cheb_polynomials
肆、时空块
一、ASTGCN_block
- 实例化变量
变量 | 类型 | 举例 | 用途 |
---|---|---|---|
DEVICE | str | cpu | |
in_channels | int | 1 | F_in |
K | int | 3 | 切比雪夫多项式的阶 |
nb_chev_filter | int | 64 | 切比雪夫模块的Filter的数目 |
nb_time_filter | int | 64 | 时间模块的Filter的数目 |
time_strides | int | 1 | 时间卷积层中在时间轴上滑动的距离 |
cheb_polynomials | np.array | adj_mx | 节点的邻接矩阵用于多项式 |
num_of_vertices | int | 307 | ifPEMS04,so顶点个数为307 |
num_of_timesteps | int | 12 | 数据的时间轴的维度T=12 |
- 函数流程
def __init__
1. 采用nn.Module的方式进行初始化
2. 定义不同功能的神经网络层
self.TAt :时间注意力层,返回E'.shape=(B,T,T)
self.SAt :空间注意力层,返回S'.shape=(B,N,N)
self.cheb_conv_SAt:带空间注意力的切比雪夫卷积算子层,返回X'.shape=(B,N,nb_cheb_filter,T)
self.time_conv:时间卷积层(借用torch自带的二维卷积层),返回X''.shape=(B,nb_time_filter,N,T)
self.residual_conv:残差卷积层(借用torch自带的二维卷积层)
self.ln:层归一化的层(借用torch自带的LayerNorm)
def forward
3. 输入数据X:(B,N,F_in,T)
4. 具体过程见流程图更清晰点
class ASTGCN_block(nn.Module):
    """One spatial-temporal block: attention, graph convolution, temporal
    convolution, residual connection and layer normalisation.

    Args:
        DEVICE: device handed to the two attention layers.
        in_channels (int): Number of input features.
        K (int): Order of Chebyshev polynomials. Degree is K-1.
        nb_chev_filter (int): Output channels of the graph convolution.
        nb_time_filter (int): Output channels of the temporal convolution,
            of the residual convolution, and size of the normalised axis.
        time_strides (int): Time strides during temporal convolution.
        cheb_polynomials: Chebyshev terms forwarded to ``cheb_conv_withSAt``.
        num_of_vertices (int): Number of graph nodes.
        num_of_timesteps (int): Number of time lags.
    """

    def __init__(self, DEVICE, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, cheb_polynomials, num_of_vertices, num_of_timesteps):
        super().__init__()
        self.TAt = Temporal_Attention_layer(DEVICE, in_channels, num_of_vertices, num_of_timesteps)
        self.SAt = Spatial_Attention_layer(DEVICE, in_channels, num_of_vertices, num_of_timesteps)
        self.cheb_conv_SAt = cheb_conv_withSAt(K, cheb_polynomials, in_channels, nb_chev_filter)

        # 1x3 kernel along the last (time) axis; padding 1 on that axis.
        self.time_conv = nn.Conv2d(
            nb_chev_filter,
            nb_time_filter,
            kernel_size=(1, 3),
            stride=(1, time_strides),
            padding=(0, 1),
        )
        # 1x1 convolution mapping the raw input to nb_time_filter channels
        # with the same stride, so it can be added to the main branch.
        self.residual_conv = nn.Conv2d(
            in_channels,
            nb_time_filter,
            kernel_size=(1, 1),
            stride=(1, time_strides),
        )
        # LayerNorm acts on the last axis, so channels are permuted there
        # before it is applied.
        self.ln = nn.LayerNorm(nb_time_filter)

    def forward(self, x):
        """
        :param x: (batch_size, N, F_in, T)
        :return: (batch_size, N, nb_time_filter, T)
        """
        batch_size, num_of_vertices, num_of_features, num_of_timesteps = x.shape

        # Temporal attention (B, T, T), applied to x flattened to (B, N*F, T).
        temporal_At = self.TAt(x)
        flattened = x.reshape(batch_size, -1, num_of_timesteps)
        x_TAt = torch.matmul(flattened, temporal_At).reshape(
            batch_size, num_of_vertices, num_of_features, num_of_timesteps
        )

        # Spatial attention is computed from the temporally re-weighted input.
        spatial_At = self.SAt(x_TAt)  # (B, N, N)

        # Graph convolution on the original x, modulated by spatial attention.
        spatial_gcn = self.cheb_conv_SAt(x, spatial_At)  # (B, N, F, T)

        # Conv2d expects channels on axis 1: (B, N, F, T) -> (B, F, N, T).
        time_conv_output = self.time_conv(spatial_gcn.permute(0, 2, 1, 3))
        x_residual = self.residual_conv(x.permute(0, 2, 1, 3))

        # (B, F, N, T) -> (B, T, N, F) for LayerNorm, then -> (B, N, F, T).
        activated = F.relu(x_residual + time_conv_output)
        normalised = self.ln(activated.permute(0, 3, 2, 1))
        return normalised.permute(0, 2, 3, 1)
注释1:nn.Conv2d
时间卷积层的形状
self.time_conv = nn.Conv2d(nb_chev_filter, nb_time_filter, kernel_size=(1, 3), stride=(1, time_strides), padding=(0, 1))
等价于
nn.Conv2d(64,64,kernel_size=(1,3),stride=(1,1),padding=(0,1))
- nb_chev_filter=64 是输入数据的特征。在conv中输入4维张量【N, C, H, W】,其中C表示channel,也就是特征的维度,因此要将输入数据spatial_gcn【shape=(B,N,nb_chev_filter,T)】进行转置,变为【shape=(B,nb_chev_filter,N,T)】
- nb_time_filter 是输出数据的特征
- H_in=307 $\to$ H_out=307
- W_in=12 $\to$ W_out=12
- 输出数据的shape=(B,nb_time_filter,N,T_out)=(32,64,307,12)
- 作代码验证如下。
class Net(nn.Module):
    """Tiny wrapper around a single Conv2d, used to check the output shape of
    the temporal convolution (64 -> 64 channels, 1x3 kernel, stride 1,
    padding 1 on the last axis)."""

    def __init__(self):
        super().__init__()
        self.time_conv = nn.Conv2d(
            in_channels=64,
            out_channels=64,
            kernel_size=(1, 3),
            stride=(1, 1),
            padding=(0, 1),
        )

    def forward(self, x):
        """Return the convolution of ``x``."""
        convolved = self.time_conv(x)
        return convolved
注释2:残差层的作用
低维数据流经非线性激活层会发生数据坍塌(信息丢失)。维度低的数据其实就是这么一种情况:其信息的冗余度高的可能性本来就低,如果强行对其进行非线性激活(维度压缩),则很有可能丢失掉有用信息,甚至丢失掉全部信息(输出为全 0)。与非线性激活层不同的是,线性激活层并不压缩特征空间的维度。于是,我们得到了一条使用激活层的原则:
- 对含有冗余信息的数据使用非线性激活(如 ReLU),对不含冗余信息的数据使用线性激活(如一些线性变换)。
- 两种类型的激活交替灵活使用,以同时兼顾非线性和信息的完整性。
- 由于冗余信息和非冗余信息所携带的有用信息是一样多的,因此在设计网络时,对内存消耗大的结构最好是用在非冗余信息上。
Conv2d的shape解释见注释1
nn.Conv2d(in_channels, nb_time_filter, kernel_size=(1, 1), stride=(1, time_strides))
#1,64
- H_in=307 $\to$ H_out=307
- W_in=12 $\to$ W_out=12
- 输出数据的shape=(B,nb_time_filter,N,T_out)=(32,64,307,12)
注释3:层归一化LayerNorm
-
《深析LayerNorm的原理》 ☀️,原理看这里
-
《BatchNorm、LayerNorm详细过程及示例》 👍 常用的两种归一化的区别和演示图,看这里
-
《Batch Norm / Layer Norm / Instance Norm / Group Norm 归一化方法》 😃 所有的归一化方法
LayerNorm的形状解释
self.ln = nn.LayerNorm(nb_time_filter)
:nb_time_filter=64,只有一个参数,说明只对输入数据的最后一个维度进行归一化。
x_residual_out = self.ln(F.relu(x_residual + time_conv_output).permute(0, 3, 2, 1)).permute(0, 2, 3, 1)
- F.relu后的out.shape=(B,nb_time_filter,N,T),例如=(32,64,307,12)
- 经permute转置后得out.shape=(32,12,307,64).只将axis=1和axis=3改变位置了。64在卷积网络中指得是channel在数据中指得是特征维度。也就是LayerNorm对数据的特征进行归一化。且归一化的数据形状不变(32,12,307,64)。
- 第一次转置是为了方便LayerNorm,第二次转置则是为了与输入数据的结构对应,方便ASTGCN模块的串联。
- 因此最终的输出数据x_residual_out.shape=(B,N,nb_time_filter,T),例如(32,307,64,12)
二、ASTGCN_submodule
- 数据流转
- 初始化变量
变量 | 类型 | 举例 | 用途 |
---|---|---|---|
DEVICE | str | cpu | |
nb_block | int | 2 | ASTGCN模块的个数 |
in_channels | int | 1 | 数据中特征的维度F_in |
K | int | 3 | 切比雪夫多项式的阶 |
nb_chev_filter | int | 64 | cheb卷积层中输出特征的维度 |
nb_time_filter | int | 64 | 时间卷积层中输出特征的维度 |
time_strides | int | 1 | 时间卷积层中在时间轴上滑动的距离 |
cheb_polynomials | np.array | adj_mx | 节点的邻接矩阵用于多项式 |
num_for_predict | int | 12 | 最后的输出数据在时间轴上的维度 |
len_input | int | 12 | 等价于num_of_timesteps,输入数据的时间轴上的维度 |
num_of_vertices | int | 307 | ifPEMS04,so顶点个数为307 |
- 函数流程
def __init__
1. 使用nn.Module的方式进行初始化
2. 通过self.to(DEVICE)将整个模型加载到DEVICE上
3. 使用nn.ModuleList进行子模型初始化,里面包含2个ASTGCN_block
第一个:ASTGCN_block(DEVICE, in_channels, K, nb_chev_filter, nb_time_filter,
time_strides, cheb_polynomials, num_of_vertices, len_input)
第二个:ASTGCN_block(DEVICE, nb_time_filter, K, nb_chev_filter, nb_time_filter,
1, cheb_polynomials, num_of_vertices, len_input//time_strides)
4. 初始化一个2维卷积网络:self.final_conv = nn.Conv2d
nn.Conv2d(int(len_input/time_strides), num_for_predict, kernel_size=(1, nb_time_filter))
def forward
5. 首先将nn.ModuleList中的子模块按照列表的顺序串联
6. 最后输入一个Conv2d(),问原文中不是全连接层的吗??
class ASTGCN_submodule(nn.Module):
    """Stack of ``nb_block`` ASTGCN blocks followed by a final convolution
    that maps the time axis to ``num_for_predict`` outputs."""

    def __init__(self, DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, cheb_polynomials, num_for_predict, len_input, num_of_vertices):
        super().__init__()

        # First block consumes the raw input: in_channels features,
        # len_input time steps, and applies the configured time stride.
        first_block = ASTGCN_block(
            DEVICE, in_channels, K, nb_chev_filter, nb_time_filter,
            time_strides, cheb_polynomials, num_of_vertices, len_input,
        )
        self.BlockList = nn.ModuleList([first_block])

        # Remaining blocks consume the previous block's output:
        # nb_time_filter features, stride 1, len_input // time_strides steps.
        for _ in range(nb_block - 1):
            self.BlockList.append(
                ASTGCN_block(
                    DEVICE, nb_time_filter, K, nb_chev_filter, nb_time_filter,
                    1, cheb_polynomials, num_of_vertices, len_input//time_strides,
                )
            )

        # The time axis is used as the Conv2d channel axis; the kernel spans
        # the whole feature axis (width nb_time_filter).
        self.final_conv = nn.Conv2d(
            int(len_input/time_strides),
            num_for_predict,
            kernel_size=(1, nb_time_filter),
        )

        self.DEVICE = DEVICE
        self.to(DEVICE)

    def forward(self, x):
        """
        :param x: (B, N_nodes, F_in, T_in)
        :return: (B, N_nodes, T_out)
        """
        # Run the spatial-temporal blocks in order.
        for st_block in self.BlockList:
            x = st_block(x)

        # (B, N, F, T) -> (B, T, N, F) so that time is the conv channel axis.
        conv_input = x.permute(0, 3, 1, 2)
        conv_output = self.final_conv(conv_input)

        # Take the last index of the trailing axis -> (B, T_out, N),
        # then swap to (B, N, T_out).
        last_slice = conv_output[:, :, :, -1]
        output = last_slice.permute(0, 2, 1)
        return output
注释1. nn.ModuleList
nn.ModuleList中通过nn.Module的方法定义的子模型(或者说神经网络?)会自动初始化,但是没有像Sequential中那样定义好网络的先后顺序。所以子模型之间的搭配顺序要自己去定义。
注释2:2个ASTGCN的对比
含义 | ASTGCN1变量 | 值 | ASTGCN2变量 | 值 |
---|---|---|---|---|
输入数据的特征的维度 | in_channels | 1 | nb_time_filter | 64 |
cheb层输出的特征 | nb_cheb_filter | 64 | nb_cheb_filter | 64 |
time层输出的特征 | nb_time_filter | 64 | nb_time_filter | 64 |
time层的滑动步伐 | time_strides | 1 | (常数) | 1 |
- 经过LayerNorm后变为(32,307,64,12)
注释3. final_conv
class Net(nn.Module):
    """Tiny wrapper around a single Conv2d, used to check the output shape of
    the final convolution (12 -> 12 channels, kernel (1, 64))."""

    def __init__(self):
        super().__init__()
        in_steps = int(12/1)
        self.final_conv = nn.Conv2d(in_steps, 12, kernel_size=(1, 64))

    def forward(self, x):
        """Return the convolution of ``x``."""
        convolved = self.final_conv(x)
        return convolved
测试: