【ASTGCN之1个特征】模型解读(torch)之模型框架(三)

前文:


import torch
import torch.nn as nn
import torch.nn.functional as F
from lib.utils import scaled_Laplacian, cheb_polynomial

壹、模型框架

一、文件内容

查看以下该.py文件中的class类,并确定class之间的关系。
在这里插入图片描述

二、算法流程

三、make_model

  • 输入变量
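| 变量(本函数) | 又名(传递) | 类型 | 举例 | 用途 |
| --- | --- | --- | --- | --- |
| DEVICE | | str | "cpu" | 模型运行的设备 |
| nb_block | | int | 2 | ST-Block 的个数(在这里插入图片描述) |
| in_channels | | int | 1 | 输入特征 F 的维度 |
| K | | int | 3 | 切比雪夫多项式的阶 |
| nb_chev_filter | | int | 64 | cheb 卷积层中输出特征的维度 |
| nb_time_filter | | int | 64 | 时间卷积层中输出特征的维度 |
| time_strides | =num_of_hours | int | 1 | 时间卷积层中在时间轴上滑动的步长 |
| cheb_polynomials | | np.array | adj_mx | 用于计算切比雪夫多项式的矩阵 |
| nb_predict_step | =num_for_predict | int | 12 | 预测的时间步数;若为 12,则预测未来 1 小时 |
| len_input | num_of_timesteps | int | 12 | 时间维度 T 的值(在 ASTGCN_block 中作为 num_of_timesteps 传入) |
| num_of_vertices | | int | 307 | 若数据集为 PEMS04,则顶点个数为 307 |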
def make_model(DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, adj_mx, num_for_predict, len_input, num_of_vertices):
    """Build an ASTGCN_submodule for the given graph and initialise its weights.

    The adjacency matrix is passed through ``scaled_Laplacian`` and
    ``cheb_polynomial``; each array returned by the latter is converted to a
    float32 tensor on ``DEVICE`` and handed to the model.

    :param DEVICE: device the polynomial tensors and the model are placed on
    :param nb_block: number of blocks, forwarded to ASTGCN_submodule
    :param in_channels: feature size of the input, forwarded to the model
    :param K: number of Chebyshev terms requested from ``cheb_polynomial``
    :param nb_chev_filter: forwarded to the model
    :param nb_time_filter: forwarded to the model
    :param time_strides: forwarded to the model
    :param adj_mx: adjacency matrix given to ``scaled_Laplacian``
        (presumably a square np.ndarray -- confirm against lib.utils)
    :param num_for_predict: forwarded to the model
    :param len_input: forwarded to the model
    :param num_of_vertices: forwarded to the model
    :return: the initialised ASTGCN_submodule
    """
    laplacian = scaled_Laplacian(adj_mx)

    # One tensor per polynomial term, converted from ndarray and moved to DEVICE.
    cheb_polynomials = []
    for term in cheb_polynomial(laplacian, K):
        cheb_polynomials.append(torch.from_numpy(term).type(torch.FloatTensor).to(DEVICE))

    model = ASTGCN_submodule(
        DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter,
        time_strides, cheb_polynomials, num_for_predict, len_input, num_of_vertices,
    )

    # This step matters: several layers allocate their parameters without
    # initial values, so every parameter is reset here. Tensors with more than
    # one dimension get Glorot (Xavier) uniform values, which keeps the signal
    # from fading or blowing up; vectors and scalars get uniform_ with its
    # default bounds.
    for param in model.parameters():
        if param.dim() <= 1:
            nn.init.uniform_(param)
        else:
            nn.init.xavier_uniform_(param)

    return model

贰、时间和空间注意力层

一、Temporal_Attention_layer

通过该module获得时间注意力矩阵E_normalized.
$$E = V_e \cdot \sigma\left(\left(\left(\mathcal{X}^{(r-1)}_h\right)^T U_1\right) U_2 \left(U_3 \mathcal{X}^{(r-1)}_h\right) + b_e\right)$$
$$E'_{i,j} = \frac{\exp(E_{i,j})}{\sum_{j=1}^{T_{r-1}} \exp(E_{i,j})}$$

where,

$V_e, b_e \in R^{T_{r-1}\times T_{r-1}}$,$U_1 \in R^{N}$,$U_2 \in R^{C_{r-1}\times N}$,$U_3 \in R^{C_{r-1}}$ 都是可学习的参数。
$E_{i,j} \in E$ 在语义上表示时间步 $i$ 与时间步 $j$ 之间的依赖关系强度。最后,$E$ 被 softmax 函数规范化。
输入:$\mathcal{X}^{(r-1)}_h=(X_1,X_2,...,X_{T_{r-1}}) \in R^{N\times C_{r-1} \times T_{r-1}}$,上标 $r-1$ 表示这是第 $r$ 层的输入。
输出:$E'$
作用:$\tilde{\mathcal{X}}^{(r-1)}_h=(\tilde{X}_1,\tilde{X}_2,...,\tilde{X}_{T_{r-1}}) =(X_1,X_2,...,X_{T_{r-1}})E' \in R^{N\times C_{r-1} \times T_{r-1}}$

  • 实例化参数
| 变量 | 类型 | 举例 | 用途 |
| --- | --- | --- | --- |
| DEVICE | str | "cpu" | 将变量和模型放到设备上运行 |
| in_channels | int | 1 | 输入特征的维度 F |
| num_of_vertices | int | 307 | 顶点个数 |
| num_of_timesteps | int | 12 | 时间轴的长度 T=12 |
  • 函数流程
def  __init__
1. 初始化
2. 设置可学习参数的结构,并放到DEVICE上
    U1:(N, ) =(307,)
    U2:(F,N) =(1,307)
    U3:(F, ) =(1,)
    be:(1,T,T) = (1, 12, 12)
    Ve:(T,T)   = (12, 12)
def forward
3.  输入变量x:shape=(B, N, F, T).与train_loader,val_loader,test_loader的形状类似
4. lhs= (x^T)*U1,再*U2.
               x作转置是为了可以先与U1相乘,因此x^T=(B,不确定,不确定,N)
               (x^T)*U1=(B,不确定,不确定),为了可以继续和U2=(F,N)相乘,因此其形状为(B,T,F)
               所以,x的转置操作为x.permute(0, 3, 2, 1),即(B,N,F,T)->(B,T,F,N)。
               (B,T,F,N)*(N,)*(F,N)=(B,T,F)*(F,N)=(B,T,N)
5. rhs=U3*x.    (F,)*(B,N,F,T) =(B,N,T)
6. product=lhs*rhs   (B,T,N)*(B,N,T)=(B,T,T) l:left,r:right
7. E = Ve  *  sigmoid(product+be)  shape is (B,T,T)
8. E_normalized = softmax(E)       shape is (B,T,T)
class Temporal_Attention_layer(nn.Module):
    """Temporal attention: scores every pair of time steps of the input.

    Learnable tensors (allocated without initial values, so the caller must
    initialise them before use):
        U1: (N,)    U2: (F_in, N)    U3: (F_in,)
        be: (1, T, T)    Ve: (T, T)
    where N = num_of_vertices, F_in = in_channels, T = num_of_timesteps.
    """

    def __init__(self, DEVICE, in_channels, num_of_vertices, num_of_timesteps):
        super().__init__()

        def new_param(*shape):
            # Uninitialised float32 storage of the given shape, placed on DEVICE.
            return nn.Parameter(torch.FloatTensor(*shape).to(DEVICE))

        self.U1 = new_param(num_of_vertices)
        self.U2 = new_param(in_channels, num_of_vertices)
        self.U3 = new_param(in_channels)
        self.be = new_param(1, num_of_timesteps, num_of_timesteps)
        self.Ve = new_param(num_of_timesteps, num_of_timesteps)

    def forward(self, x):
        """Return the normalised temporal attention matrix.

        :param x: (B, N, F_in, T), e.g. (32, 307, 1, 12)
        :return: (B, T, T); softmax is taken over dim=1
        """
        # Unpacking also rejects inputs that are not 4-dimensional.
        batch, nodes, feats, steps = x.shape

        # Left-hand embedding: (B, N, F, T) -> (B, T, F, N), contract N with U1
        # to get (B, T, F), then expand with U2 to (B, T, N).
        time_major = x.permute(0, 3, 2, 1)
        lhs = torch.matmul(time_major, self.U1)
        lhs = torch.matmul(lhs, self.U2)

        # Right-hand embedding: contract the feature axis with U3 -> (B, N, T).
        rhs = torch.matmul(self.U3, x)

        # (B, T, N) @ (B, N, T) -> (B, T, T), biased and squashed by sigmoid.
        gated = torch.sigmoid(torch.matmul(lhs, rhs) + self.be)

        # Mix with Ve (T, T), then normalise.
        scores = torch.matmul(self.Ve, gated)
        return F.softmax(scores, dim=1)

二、Spatial_Attention_layer

$$S = V_s \cdot \sigma\left(\left(\mathcal{X}^{(r-1)}_h W_1\right) W_2 \left(W_3 \mathcal{X}^{(r-1)}_h\right)^T + b_s\right)$$
$$S'_{i,j} = \frac{\exp(S_{i,j})}{\sum_{j=1}^{N} \exp(S_{i,j})}$$

$C_{r-1}$ 是第 $r$ 层输入数据的通道数(即特征维度 F)。
$\sigma$ 是激活函数(代码中使用 sigmoid)。
S(注意力矩阵)是根据该层的当前输入动态计算的。
$S_{i,j}\in S$ 表示节点 $i$ 与节点 $j$ 之间的相关强度。然后使用 softmax 函数确保节点的 attention weights 的和为 1。
特别地:
当 $r=1$ 时,$C_0=F$;$T_{r-1}$ 是第 $r$ 层时间维度的长度。
当 $r=1$ 时,$T_0=T_h$(或 $T_0=T_d$,或 $T_0=T_w$)。$V_s,b_s \in R^{N\times N}$,$W_1 \in R^{T_{r-1}}$,$W_2\in R^{C_{r-1}\times T_{r-1}}$,$W_3 \in R^{C_{r-1}}$ 都是可学习的参数。
输入:$\mathcal{X}^{(r-1)}_h=(X_1,X_2,...,X_{T_{r-1}}) \in R^{N\times C_{r-1} \times T_{r-1}}$,也是第 $r$ 个 spatial-temporal block 的输入
输出:$S'$
作用:

  • 实例化参数
| 变量 | 类型 | 举例 | 用途 |
| --- | --- | --- | --- |
| DEVICE | str | "cpu" | |
| in_channels | int | 1 | 输入中特征的维度 F |
| num_of_vertices | int | 307 | 顶点个数 |
| num_of_timesteps | int | 12 | 时间轴的维度 T |
  • 函数流程
def  __init__
1. 按照nn.Module的方式进行初始化
2. 设置可以学习的参数结构并放到DEVICE  
     W1:(T,)   =(12,)
     W2:(F,T)   =(1, 12)
     W3:(F,)    =(1,)
     bs: (1, N, N) =(1,307,307)
     Vs:(N, N)     =(307,307)
def forward
3. 输入变量X:shape=(B, N, F, T).与train_loader,val_loader,test_loader的形状类似
4. lhs = X * W1,再*W2.  (B,N,F,T) *(T,)=(B, N,F);再乘时,(B,N,F)*(F,T)=(B,N,T)
5. rhs= (W3*X)^T.  (F,)*(B, N,F,T) =(B,N,T)规则是(B,F)看作batch不变,(F,)*(F,T)=(T,)
                 ^T的作用是为了方便lhs*rhs, (B,N,T) =(B,未知,未知)=(B, N,N)
                 所以rhs=(B,T,N)<--(B,N,T),只需要对最后两个轴交换位置即可(transpose(-1,-2))
6. product=lhs*rhs    shape is (B, N, N)
7. S = Vs * sigmoid(product+bs)  shape is (B, N, N)
8. S_normaled = softmax(S)

class Spatial_Attention_layer(nn.Module):
    """Spatial attention: scores every pair of graph nodes of the input.

    Learnable tensors (allocated without initial values, so the caller must
    initialise them before use):
        W1: (T,)    W2: (F_in, T)    W3: (F_in,)
        bs: (1, N, N)    Vs: (N, N)
    where N = num_of_vertices, F_in = in_channels, T = num_of_timesteps.
    """

    def __init__(self, DEVICE, in_channels, num_of_vertices, num_of_timesteps):
        super(Spatial_Attention_layer, self).__init__()
        self.W1 = nn.Parameter(torch.FloatTensor(num_of_timesteps).to(DEVICE))  # (T,), e.g. (12,)
        self.W2 = nn.Parameter(torch.FloatTensor(in_channels, num_of_timesteps).to(DEVICE))  # (F_in, T), e.g. (1, 12)
        self.W3 = nn.Parameter(torch.FloatTensor(in_channels).to(DEVICE))  # (F_in,), e.g. (1,)
        self.bs = nn.Parameter(torch.FloatTensor(1, num_of_vertices, num_of_vertices).to(DEVICE))  # (1, N, N)
        self.Vs = nn.Parameter(torch.FloatTensor(num_of_vertices, num_of_vertices).to(DEVICE))  # (N, N)

    def forward(self, x):
        '''
        Make a forward pass of the spatial attention layer.

        Parameters
        ----------
        x: torch.Tensor, x^{(r - 1)}_h,
           shape is (batch_size, N_nodes, C_{r-1}, T_{r-1});
           C_{r-1} / T_{r-1} are the feature and time sizes of the input of
           the r-th ST block (for r = 1 in the running example: F = 1, T = 12)

        Returns
        ----------
        S_normalized: torch.Tensor, S', spatial attention scores,
                      shape is (batch_size, N_nodes, N_nodes),
                      softmax-normalised over dim=1
        '''
        # lhs = x * W1 * W2
        # (B, N, F_in, T)(T) -> (B, N, F_in); (B, N, F_in)(F_in, T) -> (B, N, T)
        # e.g. (32, 307, 1, 12) -> (32, 307, 1) -> (32, 307, 12)
        lhs = torch.matmul(torch.matmul(x, self.W1), self.W2)

        # rhs = (W3 * x)^T
        # (F_in)(B, N, F_in, T) -> (B, N, T), then swap the last two axes -> (B, T, N)
        rhs = torch.matmul(self.W3, x).transpose(-1, -2)

        # (B, N, T)(B, T, N) -> (B, N, N), e.g. (32, 307, 307)
        product = torch.matmul(lhs, rhs)

        # Mix with Vs: (N, N)(B, N, N) -> (B, N, N)
        S = torch.matmul(self.Vs, torch.sigmoid(product + self.bs))

        # Normalise the scores. The result has to be returned: without the
        # return statement this method yields None and the graph convolution
        # that consumes the attention matrix fails.
        S_normalized = F.softmax(S, dim=1)
        return S_normalized

叁、切比雪夫图卷积的制造

一、cheb_conv_withSAt

该模块是带空间注意力的切比雪夫谱图卷积算子 $g_{\theta} *_G$:
$$g_{\theta} *_G x = g_{\theta}(L)x = \sum_{k=0}^{K-1} \theta_k \left(T_k(\tilde{L}) \odot S'\right) x$$

其中 $\tilde{L}=\frac{2L}{\lambda_{\max}} - I$ 表示经过缩放和归一化的拉普拉斯矩阵(scaled and normalized Laplacian),
并且

  • $S'$:(B, N, N) 空间注意力矩阵
  • $\theta_k$:(F_in, F_out)
  • $T_k(\tilde{L})$:(N, N) 第 k 项切比雪夫多项式
  • $x$:(B, N, F_in, T) 图信号矩阵,作为输入数据
  • 结果: (B, N, F_out, T)
  • 初始化变量
| 变量 | 类型 | 举例 | 用途 |
| --- | --- | --- | --- |
| K | int | 3 | 切比雪夫图卷积的阶 |
| cheb_polynomials | fun | 见 sec A | 返回切比雪夫多项式的系数列表 |
| in_channels | int | 1 | 输入数据的特征这一轴的维度值 F_in |
| out_channels | int | 64 | 输出数据特征 F_out;nb_chev_filter |
  • 函数流程
def __init__
1. 按照nn.Module的方式进行初始化
2. 设置类的属性K, cheb_polynomials,in_channels,out_channels,DEVICE
3. 设置参数列表 Theta 并初始化;列表中参数共有K个,每个的shape为(F_in, F_out)
def  forward
4. 输入X:(B, N, F_in, T) ; Spatial_attention:(B, N, N)
5. for  time_step  in  range(T):
       graph_signal = x[:, :, :, time_step]   (B, N, F_in)
       output:初始设置为(B, N, F_out)的全0矩阵,用作累计求和
       for  k in range(self.K):
              T_k:(N, N)第k项切比雪夫多项式系数
              T_k_with_at: =T_k*S' ;(B, N, N) 是逐元素乘法
              rhs:=(T_k_with_at)^T*graph_signal ; (B, N, F_in) =(B, N, N)*(B, N, F_in),代码中先用permute(0, 2, 1)转置
              theta_k:(F_in,F_out)第k项Theta中的参数
              output=output+ rhs*theta_k   ;(B,N,F_out)= (B,N,F_in)* (F_in,F_out)
      将output在最后一轴升维后(B,N,F_out,1),放入列表outputs中
6. 将outputs中的元素按照axis=-1进行合并,得到结果outputs_new为(B,N,F_out,T).这样步骤5-6就在不改动T轴的情况下完成了图卷积工作
7. return(F.relu(outputs_new))
       
  • 输出数据:outputs_new为(B,N,F_out,T).
class cheb_conv_withSAt(nn.Module):
    '''
    K-order Chebyshev graph convolution modulated by spatial attention.

    Holds K weight matrices ``Theta[k]`` of shape (F_in, F_out). They are
    allocated without initial values, so the caller must initialise them.
    '''

    def __init__(self, K, cheb_polynomials, in_channels, out_channels):
        '''
        :param K: int, number of polynomial terms used
        :param cheb_polynomials: sequence of (N, N) tensors, indexed 0..K-1;
            the device of its first entry is used for the weights and outputs
        :param in_channels: int, F_in
        :param out_channels: int, F_out
        '''
        super(cheb_conv_withSAt, self).__init__()
        self.K = K
        self.cheb_polynomials = cheb_polynomials
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.DEVICE = cheb_polynomials[0].device
        self.Theta = nn.ParameterList([nn.Parameter(torch.FloatTensor(in_channels, out_channels).to(self.DEVICE)) for _ in range(K)])

    def forward(self, x, spatial_attention):
        '''
        Chebyshev graph convolution operation
        :param x: (batch_size, N, F_in, T)
        :param spatial_attention: (batch_size, N, N)
        :return: (batch_size, N, F_out, T), after ReLU
        '''
        batch_size, num_of_vertices, in_channels, num_of_timesteps = x.shape

        # (T_k ⊙ S')^T does not depend on the time step, so the K attention-
        # weighted operators are built once here instead of once per time
        # step. (N, N) * (B, N, N) broadcasts to (B, N, N).
        weighted_polys = [
            self.cheb_polynomials[k].mul(spatial_attention).permute(0, 2, 1)
            for k in range(self.K)
        ]

        outputs = []
        for time_step in range(num_of_timesteps):
            graph_signal = x[:, :, :, time_step]  # (B, N, F_in)
            output = torch.zeros(batch_size, num_of_vertices, self.out_channels).to(self.DEVICE)  # (B, N, F_out)
            for k in range(self.K):
                rhs = weighted_polys[k].matmul(graph_signal)  # (B, N, N)(B, N, F_in) -> (B, N, F_in)
                output = output + rhs.matmul(self.Theta[k])  # (B, N, F_in)(F_in, F_out) -> (B, N, F_out)
            outputs.append(output.unsqueeze(-1))  # (B, N, F_out, 1)

        # Concatenate the per-step results along the last (time) axis.
        return F.relu(torch.cat(outputs, dim=-1))  # (B, N, F_out, T)

二、cheb_conv

该模块是不带注意力的切比雪夫谱图卷积算子 $g_{\theta} *_G$。算法流程与前面相似。
$$g_{\theta} *_G x = g_{\theta}(L)x = \sum_{k=0}^{K-1} \theta_k T_k(\tilde{L}) x$$

class cheb_conv(nn.Module):
    '''
    K-order Chebyshev graph convolution (no attention).

    Holds K weight matrices ``Theta[k]`` of shape (F_in, F_out). They are
    allocated without initial values, so the caller must initialise them.
    '''

    def __init__(self, K, cheb_polynomials, in_channels, out_channels):
        '''
        :param K: int, number of polynomial terms used
        :param cheb_polynomials: sequence of (N, N) tensors, indexed 0..K-1;
            the device of its first entry is used for the weights and outputs
        :param in_channels: int, num of channels in the input sequence
        :param out_channels: int, num of channels in the output sequence
        '''
        super().__init__()
        self.K = K
        self.cheb_polynomials = cheb_polynomials
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.DEVICE = cheb_polynomials[0].device

        weights = []
        for _ in range(K):
            weights.append(nn.Parameter(torch.FloatTensor(in_channels, out_channels).to(self.DEVICE)))
        self.Theta = nn.ParameterList(weights)

    def forward(self, x):
        '''
        Chebyshev graph convolution operation
        :param x: (batch_size, N, F_in, T)
        :return: (batch_size, N, F_out, T), after ReLU
        '''
        n_batch, n_nodes, _, n_steps = x.shape

        per_step = []
        for t in range(n_steps):
            frame = x[:, :, :, t]  # (B, N, F_in)
            acc = torch.zeros(n_batch, n_nodes, self.out_channels).to(self.DEVICE)  # (B, N, F_out)
            for order in range(self.K):
                poly = self.cheb_polynomials[order]  # (N, N)
                weight = self.Theta[order]  # (F_in, F_out)
                # (B, F_in, N)(N, N) -> (B, F_in, N), then back to (B, N, F_in)
                propagated = frame.permute(0, 2, 1).matmul(poly).permute(0, 2, 1)
                acc = acc + propagated.matmul(weight)
            per_step.append(acc.unsqueeze(-1))  # (B, N, F_out, 1)

        stacked = torch.cat(per_step, dim=-1)  # (B, N, F_out, T)
        return F.relu(stacked)

A. lib.utils的cheb_polynomial

计算切比雪夫多项式的系数from T_0 to T_{K-1}并返回系数列表

  • 函数流程
1. 从矩阵中获得顶点个数N,并生成单位矩阵
2. 将 单位矩阵(L^0) 和 L(L^1) 放入列表 cheb_polynomials
3. 根据 切比雪夫多项式的递推公式,依次获得 T_2,...,T_{K-1} 并放入 列表cheb_polynomials
4. 返回切比雪夫多项式的 系数列表
def cheb_polynomial(L_tilde, K):
    '''
    Compute the Chebyshev polynomials T_0 .. T_{K-1} of the scaled Laplacian.

    Uses the recurrence T_k(L) = 2 L T_{k-1}(L) - T_{k-2}(L) with T_0 = I and
    T_1 = L, where the product is a matrix product.

    Parameters
    ----------
    L_tilde: scaled Laplacian, np.ndarray, shape (N, N)
    K: the maximum order of chebyshev polynomials

    Returns
    ----------
    cheb_polynomials: list(np.ndarray), from T_0 to T_{K-1}.
        The list always starts with [I, copy of L_tilde], so it has two
        entries even when K < 2.
    '''
    N = L_tilde.shape[0]
    cheb_polynomials = [np.identity(N), L_tilde.copy()]

    for i in range(2, K):
        # `@` is the matrix product. On an ndarray `*` multiplies element by
        # element, which does not give the Chebyshev polynomial of a matrix.
        cheb_polynomials.append(2 * L_tilde @ cheb_polynomials[i - 1] - cheb_polynomials[i - 2])

    return cheb_polynomials

肆、时空块

一、ASTGCN_block

  • 实例化变量
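| 变量 | 类型 | 举例 | 用途 |
| --- | --- | --- | --- |
| DEVICE | str | cpu | |
| in_channels | int | 1 | F_in |
| K | int | 3 | 切比雪夫多项式的阶 |
| nb_chev_filter | int | 64 | 切比雪夫模块的 Filter 的数目 |
| nb_time_filter | int | 64 | 时间模块的 Filter 的数目 |
| time_strides | int | 1 | 时间卷积层中在时间轴上滑动的步长 |
| cheb_polynomials | np.array | adj_mx | 节点的邻接矩阵,用于多项式 |
| num_of_vertices | int | 307 | 若数据集为 PEMS04,则顶点个数为 307 |
| num_of_timesteps | int | 12 | 数据的时间轴的维度 T=12 |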
  • 函数流程
def __init__
1. 采用nn.Module的方式进行初始化
2. 定义不同功能的神经网络层
    self.TAt :时间注意力层,返回E'.shape=(B,T,T)
    self.SAt :空间注意力层,返回S'.shape=(B,N,N)
    self.cheb_conv_SAt:带空间注意力的切比雪夫卷积算子层,返回X'.shape=(B,N,nb_cheb_filter,T)
    self.time_conv:时间卷积层(借用torch自带的二维卷积层),返回X''.shape=(B,nb_time_filter,N,T)
    self.residual_conv:残差卷积层(借用torch自带的二维卷积层)
    self.ln:层归一化的层(借用torch自带的LayerNorm)
def forward
3. 输入数据X:(B,N,F_in,T)
4. 具体过程见流程图更清晰点
(ASTGCN模块的流程图,按数据流向整理如下)
  1. 数据X.shape=(B,N,F_in,T) --数据输入--> TAt:获得时间注意力矩阵E'(temporal_At)
  2. X 与 temporal_At 相乘 --get ST matrix--> 数据x_TAt.shape=(B,N,F_in,T)
  3. x_TAt --数据输入--> SAt:获得空间注意力矩阵S'(spatial_At)
  4. X 与 spatial_At --数据同时输入--> cheb_conv_SAt:获得图卷积结果(spatial_gcn)
  5. spatial_gcn --数据(B,N,nb_cheb_filter,T)输入--> time_conv:获得时间卷积结果(time_conv_output)
  6. X --数据输入--> residual_conv:获得线性层的结果(x_residual)
  7. time_conv_output 与 x_residual --数据(B,nb_time_filter,N,T)输入--> F.relu --数据输入--> nn.LayerNorm:获得结果
  8. 输出 x_residual:(B,N,nb_time_filter,T)
ASTGCN模块的流程图
class ASTGCN_block(nn.Module):
    '''
    One spatial-temporal block: temporal attention, spatial attention,
    attention-weighted Chebyshev graph convolution, temporal convolution,
    residual connection and layer normalisation.

    Args:
        DEVICE: device passed on to the two attention layers.
        in_channels (int): Number of input features.
        K (int): Order of Chebyshev polynomials. Degree is K-1.
        nb_chev_filter (int): Output channels of the graph convolution.
        nb_time_filter (int): Output channels of the temporal convolution.
        time_strides (int): Time strides during temporal convolution.
        cheb_polynomials: Chebyshev polynomial tensors for the graph convolution.
        num_of_vertices (int): Number of graph nodes.
        num_of_timesteps (int): Number of time lags.
    '''

    def __init__(self, DEVICE, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, cheb_polynomials, num_of_vertices, num_of_timesteps):
        super().__init__()
        self.TAt = Temporal_Attention_layer(DEVICE, in_channels, num_of_vertices, num_of_timesteps)
        self.SAt = Spatial_Attention_layer(DEVICE, in_channels, num_of_vertices, num_of_timesteps)
        self.cheb_conv_SAt = cheb_conv_withSAt(K, cheb_polynomials, in_channels, nb_chev_filter)

        # Convolution along the time axis only: a (1, 3) kernel with padding 1
        # on that axis.
        self.time_conv = nn.Conv2d(
            nb_chev_filter,
            nb_time_filter,
            kernel_size=(1, 3),
            stride=(1, time_strides),
            padding=(0, 1),
        )

        # 1x1 convolution that maps the raw input to nb_time_filter channels
        # (and applies the same time stride) so it can be added as a residual.
        self.residual_conv = nn.Conv2d(
            in_channels,
            nb_time_filter,
            kernel_size=(1, 1),
            stride=(1, time_strides),
        )

        # Normalises over the last axis, so channels must be moved there first.
        self.ln = nn.LayerNorm(nb_time_filter)

    def forward(self, x):
        '''
        :param x: (batch_size, N, F_in, T), e.g. (32, 307, 1, 12)
        :return: (batch_size, N, nb_time_filter, T), e.g. (32, 307, 64, 12)
        '''
        batch_size, num_of_vertices, num_of_features, num_of_timesteps = x.shape

        # Re-weight the time axis with the temporal attention matrix (B, T, T).
        temporal_At = self.TAt(x)
        flattened = x.reshape(batch_size, -1, num_of_timesteps)  # (B, N*F, T)
        x_TAt = torch.matmul(flattened, temporal_At)
        x_TAt = x_TAt.reshape(batch_size, num_of_vertices, num_of_features, num_of_timesteps)

        # Spatial attention is computed from the time-attended input.
        spatial_At = self.SAt(x_TAt)  # (B, N, N)

        # Graph convolution with spatial attention.
        # NOTE(review): this consumes the raw x, not x_TAt.
        spatial_gcn = self.cheb_conv_SAt(x, spatial_At)  # (B, N, F, T)

        # Both convolutions expect channels on axis 1: (B, N, F, T) -> (B, F, N, T).
        time_conv_output = self.time_conv(spatial_gcn.permute(0, 2, 1, 3))
        x_residual = self.residual_conv(x.permute(0, 2, 1, 3))

        # (B, F, N, T) -> (B, T, N, F) for LayerNorm, then -> (B, N, F, T).
        activated = F.relu(x_residual + time_conv_output)
        normalised = self.ln(activated.permute(0, 3, 2, 1))
        x_residual_out = normalised.permute(0, 2, 3, 1)
        return x_residual_out

注释1:nn.Conv2d

时间卷积层的形状
self.time_conv = nn.Conv2d(nb_chev_filter, nb_time_filter, kernel_size=(1, 3), stride=(1, time_strides), padding=(0, 1))等价于

nn.Conv2d(64,64,kernel_size=(1,3),stride=(1,1),padding=(0,1))
  • nb_chev_filter=64是输入数据的特征维度。Conv2d的输入是4维张量【N, C, H, W】,其中C表示channel,也就是特征的维度,因此要将输入数据spatial_gcn【shape=(B,N,nb_chev_filter,T)】进行转置,变为【shape=(B,nb_chev_filter,N,T)】
  • nb_time_filter是输出数据的特征维度,
    在这里插入图片描述
    $H_{in}=307 \to H_{out}=307$
    $W_{in}=12 \to W_{out}=12$

输出数据的shape=(B,nb_time_filter,N,T_out)=(32,64,307,12)

  • 作代码验证如下。
class Net(nn.Module):
    """Minimal wrapper used to check the output shape of the temporal convolution."""

    def __init__(self):
        super().__init__()
        # 64 -> 64 channels, kernel (1, 3) with stride 1 and padding 1 on the
        # last axis.
        self.time_conv = nn.Conv2d(
            in_channels=64,
            out_channels=64,
            kernel_size=(1, 3),
            stride=(1, 1),
            padding=(0, 1),
        )

    def forward(self, x):
        """Apply the temporal convolution to x, laid out as (B, 64, H, W)."""
        out = self.time_conv(x)
        return out

在这里插入图片描述

注释2:残差层的作用

低维数据流经非线性激活层会发生数据坍塌(信息丢失)。维度低的数据其实就是这么一种情况:其信息的冗余度高的可能性本来就低,如果强行对其进行非线性激活(维度压缩),则很有可能丢失掉有用信息,甚至丢失掉全部信息(输出为全 0)。与非线性激活层不同的是,线性激活层并不压缩特征空间的维度。于是,我们得到了一条使用激活层的原则:

  • 对含有冗余信息的数据使用非线性激活(如 ReLU),对不含冗余信息的数据使用线性激活(如一些线性变换)。
  • 两种类型的激活交替灵活使用,以同时兼顾非线性和信息的完整性。
  • 由于冗余信息和非冗余信息所携带的有用信息是一样多的,因此在设计网络时,对内存消耗大的结构最好是用在非冗余信息上。

Conv2d的shape解释见注释1

  • nn.Conv2d(in_channels, nb_time_filter, kernel_size=(1, 1), stride=(1, time_strides)),即通道数由 1 变为 64
  • $H_{in}=307 \to H_{out}=307$
    $W_{in}=12 \to W_{out}=12$
  • 输出数据的shape=(B,nb_time_filter,N,T_out)=(32,64,307,12)
注释3:层归一化LayerNorm

LayerNorm的形状解释
self.ln = nn.LayerNorm(nb_time_filter):nb_time_filter=64,只有一个参数,说明只对输入数据的最后一个维度进行归一化。
x_residual_out = self.ln(F.relu(x_residual + time_conv_output).permute(0, 3, 2, 1)).permute(0, 2, 3, 1)

  • F.relu后的out.shape=(B,nb_time_filter,N,T),例如=(32,64,307,12)
  • 经permute(0, 3, 2, 1)转置后得out.shape=(32,12,307,64),即只交换了axis=1和axis=3的位置。64在卷积网络中指的是channel,在数据中指的是特征维度。也就是LayerNorm对数据的特征进行归一化,且归一化后的数据形状不变,仍为(32,12,307,64)。
  • 第一次转置是为了方便LayerNorm,第二次转置则是为了与输入数据的结构对应,方便ASTGCN模块的串联。
  • 因此最终的输出数据x_residual_out.shape=(B,N,nb_time_filter,T),例如(32,307,64,12)

二、ASTGCN_submodule

  • 数据流转
input --(32,307,1,12)--> ASTGCN1 --(32,307,64,12)--> ASTGCN2 --(32,307,64,12)--> FC --(32,307,12)--> out
  • 初始化变量
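| 变量 | 类型 | 举例 | 用途 |
| --- | --- | --- | --- |
| DEVICE | str | cpu | |
| nb_block | int | 2 | ASTGCN 模块的个数 |
| in_channels | int | 1 | 数据中特征的维度 F_in |
| K | int | 3 | 切比雪夫多项式的阶 |
| nb_chev_filter | int | 64 | cheb 卷积层中输出特征的维度 |
| nb_time_filter | int | 64 | 时间卷积层中输出特征的维度 |
| time_strides | int | 1 | 时间卷积层中在时间轴上滑动的步长 |
| cheb_polynomials | np.array | adj_mx | 节点的邻接矩阵,用于多项式 |
| num_for_predict | int | 12 | 最后的输出数据在时间轴上的维度 |
| len_input | int | 12 | 等价于 num_of_timesteps,输入数据的时间轴上的维度 |
| num_of_vertices | int | 307 | 若数据集为 PEMS04,则顶点个数为 307 |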
  • 函数流程
def __init__
1. 使用nn.Module的方式进行初始化
2. 通过self.to(DEVICE)将整个模型加载到DEVICE上
3. 使用nn.ModuleList进行子模型初始化,里面包含2个ASTGCN_block
     第一个:ASTGCN_block(DEVICE, in_channels, K, nb_chev_filter, nb_time_filter,
                time_strides, cheb_polynomials, num_of_vertices, len_input)
     第二个:ASTGCN_block(DEVICE, nb_time_filter, K, nb_chev_filter, nb_time_filter, 
                1,cheb_polynomials, num_of_vertices, len_input//time_strides)
4. 初始化一个2维卷积网络:self.final_conv = nn.Conv2d
      nn.Conv2d(int(len_input/time_strides), num_for_predict, kernel_size=(1, nb_time_filter))
def forward
5. 首先将nn.ModuleList中的子模块按照列表的顺序串联
6. 最后输入一个Conv2d(),问原文中不是全连接层的吗??   
class ASTGCN_submodule(nn.Module):
    """A stack of ASTGCN blocks followed by a convolutional output layer.

    The first block maps ``in_channels`` features to ``nb_time_filter`` and
    applies ``time_strides``. Each of the remaining ``nb_block - 1`` blocks
    takes ``nb_time_filter`` features, uses a time stride of 1 and works on
    ``len_input // time_strides`` time steps.
    """

    def __init__(self, DEVICE, nb_block, in_channels, K, nb_chev_filter, nb_time_filter, time_strides, cheb_polynomials, num_for_predict, len_input, num_of_vertices):
        super().__init__()

        first_block = ASTGCN_block(
            DEVICE, in_channels, K, nb_chev_filter, nb_time_filter,
            time_strides, cheb_polynomials, num_of_vertices, len_input,
        )
        self.BlockList = nn.ModuleList([first_block])

        # With nb_block = 2 the list ends up holding exactly two blocks.
        later_blocks = [
            ASTGCN_block(
                DEVICE, nb_time_filter, K, nb_chev_filter, nb_time_filter,
                1, cheb_polynomials, num_of_vertices, len_input // time_strides,
            )
            for _ in range(nb_block - 1)
        ]
        self.BlockList.extend(later_blocks)

        # Treats the time steps as channels; the (1, nb_time_filter) kernel
        # spans the feature axis.
        self.final_conv = nn.Conv2d(
            int(len_input / time_strides),
            num_for_predict,
            kernel_size=(1, nb_time_filter),
        )

        self.DEVICE = DEVICE

        self.to(DEVICE)

    def forward(self, x):
        '''
        :param x: (B, N_nodes, F_in, T_in)
        :return: (B, N_nodes, T_out), e.g. (32, 307, 12)
        '''
        # Run the spatial-temporal blocks one after another.
        for block in self.BlockList:
            x = block(x)

        # (B, N, F, T) -> (B, T, N, F); the convolution then yields
        # (B, T_out, N, W), of which the last position on the final axis is
        # kept: (B, T_out, N). The closing permute gives (B, N, T_out).
        projected = self.final_conv(x.permute(0, 3, 1, 2))
        output = projected[:, :, :, -1].permute(0, 2, 1)
        return output
注释1. nn.ModuleList

nn.ModuleList中通过nn.Module的方法定义的子模型(或者说神经网络?)会被自动注册,其参数也会自动初始化,但是它不像Sequential那样定义好了网络的先后顺序。所以子模型之间的搭配顺序要自己在forward中定义。

注释2:2个ASTGCN的对比
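| 含义 | ASTGCN1 变量 | ASTGCN1 取值 | ASTGCN2 变量 | ASTGCN2 取值 |
| --- | --- | --- | --- | --- |
| 输入数据的特征的维度 | in_channels | 1 | nb_time_filter | 64 |
| cheb层输出的特征 | nb_chev_filter | 64 | nb_chev_filter | 64 |
| time层输出的特征 | nb_time_filter | 64 | nb_time_filter | 64 |
| time层的滑动步长 | time_strides | 1 | (固定值) | 1 |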
  • 经过LayerNorm后变为(32,307,64,12)
注释3. final_conv
class Net(nn.Module):
    """Minimal wrapper used to check the output shape of the final convolution."""

    def __init__(self):
        super().__init__()
        time_channels = int(12 / 1)  # len_input / time_strides
        # 12 -> 12 channels; the (1, 64) kernel spans a last axis of size 64.
        self.final_conv = nn.Conv2d(time_channels, 12, kernel_size=(1, 64))

    def forward(self, x):
        """Apply the final convolution to x, laid out as (B, 12, H, W)."""
        out = self.final_conv(x)
        return out

测试:
在这里插入图片描述

  • 12
    点赞
  • 47
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值