The GCN section covered how the Cora dataset is processed and what the loader returns:
- features: the papers' attribute features, with shape $2708 \times 1433$, row-normalized so that each paper's feature values sum to 1.
- labels: each paper's class id, 0-6.
- adj: the adjacency matrix, with shape $2708 \times 2708$.
- idx_train: 0-139
- idx_val: 200-499
- idx_test: 500-1499
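To make these shapes concrete, here is a minimal sketch with fabricated stand-in tensors (not the repo's actual loader):

import torch

# Hedged sketch: fabricate Cora-shaped tensors with the properties listed above.
N, F_in, C = 2708, 1433, 7
features = torch.rand(N, F_in)
features = features / features.sum(dim=1, keepdim=True)  # row-normalize: each paper's features sum to 1
labels = torch.randint(0, C, (N,))                        # class ids 0-6
adj = torch.eye(N)                                        # stand-in adjacency matrix, 2708 x 2708
idx_train, idx_val, idx_test = torch.arange(0, 140), torch.arange(200, 500), torch.arange(500, 1500)
print(features.shape, adj.shape)  # torch.Size([2708, 1433]) torch.Size([2708, 2708])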
This section walks through the GAT model:
The GAT model
model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GAT(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads):
        """Dense version of GAT."""
        super(GAT, self).__init__()
        self.dropout = dropout
        # first layer: nheads independent attention heads
        self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)
        # second (final) attention layer
        self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)  # concatenate the heads' outputs
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x, adj))  # second (output) attention layer
        return F.log_softmax(x, dim=1)
layers:
class GraphAttentionLayer(nn.Module):
    """
    Simple GAT layer, similar to https://arxiv.org/abs/1710.10903
    """
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2*out_features, 1)))  # applied to concat(V, NeigV)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, adj):
        Wh = torch.mm(h, self.W)  # h.shape: (N, in_features), Wh.shape: (N, out_features)
        # pair every node with every node: a_input.shape = (N, N, 2*out_features)
        a_input = self._prepare_attentional_mechanism_input(Wh)
        # a_input.shape = (2708, 2708, 16), self.a.shape = (16, 1), so
        # torch.matmul(a_input, self.a).shape = (2708, 2708, 1);
        # squeeze(2) drops the last dimension, giving e.shape = (2708, 2708)
        e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
        # e scores every node against every node, but only the coefficients of
        # connected nodes are needed, so mask the rest
        zero_vec = -9e15*torch.ones_like(e)
        # where there is no edge (adj == 0), replace the score with -9e15 (~ -inf)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)  # row-wise softmax: each row sums to 1
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, Wh)  # aggregate the neighbors' features
        if self.concat:
            return F.elu(h_prime)  # ELU activation
        else:
            return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        N = Wh.size()[0]  # number of nodes
        # Below, two matrices are created that contain embeddings in their rows in different orders.
        # (e stands for embedding)
        # These are the rows of the first matrix (Wh_repeated_in_chunks):
        # e1, e1, ..., e1, e2, e2, ..., e2, ..., eN, eN, ..., eN
        # '-------------' -> N times  '-------------' -> N times  '-------------' -> N times
        #
        # These are the rows of the second matrix (Wh_repeated_alternating):
        # e1, e2, ..., eN, e1, e2, ..., eN, ..., e1, e2, ..., eN
        # '----------------------------------------------------' -> N times
        #
        Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)  # repeat each row N times
        Wh_repeated_alternating = Wh.repeat(N, 1)               # tile the whole matrix N times
        # Wh_repeated_in_chunks.shape == Wh_repeated_alternating.shape == (N * N, out_features)
        # The all_combinations_matrix, created below, will look like this (|| denotes concatenation):
        # e1 || e1
        # e1 || e2
        # e1 || e3
        # ...
        # e1 || eN
        # e2 || e1
        # e2 || e2
        # e2 || e3
        # ...
        # e2 || eN
        # ...
        # eN || e1
        # eN || e2
        # eN || e3
        # ...
        # eN || eN
        all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)
        # all_combinations_matrix.shape == (N * N, 2 * out_features)
        return all_combinations_matrix.view(N, N, 2 * self.out_features)

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
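As a quick sanity check, a single layer can be run standalone; this is a hedged sketch with made-up inputs, using a small node count so the dense N × N × 2F tensor stays cheap:

# Assumes the GraphAttentionLayer class above (and its torch imports) are in scope.
layer = GraphAttentionLayer(in_features=1433, out_features=8, dropout=0.6, alpha=0.2, concat=True)
h = torch.rand(100, 1433)   # 100 toy nodes instead of 2708
adj = torch.eye(100)        # stand-in adjacency: self-loops only
out = layer(h, adj)
print(layer)                # GraphAttentionLayer (1433 -> 8)
print(out.shape)            # torch.Size([100, 8])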
Initializing the model
model = GAT(nfeat=1433,
            nhid=8,
            nclass=7,
            dropout=0.6,
            nheads=8,
            alpha=0.2)
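With the classes above in scope, a forward pass can be sketched on fabricated inputs (the tensors below are made up; only the feature dimension 1433 must match):

# Hedged usage sketch: one forward pass with 100 toy nodes.
x = torch.rand(100, 1433)
adj = torch.eye(100)
out = model(x, adj)
print(out.shape)   # torch.Size([100, 7]) -- log-probabilities over the 7 classes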
Building the attention layers:
self.dropout = 0.6
self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)]
for i, attention in enumerate(self.attentions):
    self.add_module('attention_{}'.format(i), attention)
self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)  # second (final) attention layer
attentions and out_att
First the attentions list is built; it holds 8 GraphAttentionLayer instances, and each GraphAttentionLayer is initialized as follows:
def __init__(self, in_features, out_features, dropout, alpha, concat=True):
    super(GraphAttentionLayer, self).__init__()
    self.dropout = 0.6
    self.in_features = 1433
    self.out_features = 8
    self.alpha = 0.2
    self.concat = True
    self.W = nn.Parameter(torch.empty(size=(1433, 8)))
    nn.init.xavier_uniform_(self.W.data, gain=1.414)  # initialize W
    self.a = nn.Parameter(torch.empty(size=(16, 1)))  # applied to concat(V, NeigV); 2*out_features = 16
    nn.init.xavier_uniform_(self.a.data, gain=1.414)  # initialize a
    self.leakyrelu = nn.LeakyReLU(0.2)
The parameter W has shape $W_{1433 \times 8}$, and the parameter a has shape $a_{16 \times 1}$.
out_att
out_att is similar to the attention heads; the differences are that out_att is a single GraphAttentionLayer and its parameters differ:
def __init__(self, in_features, out_features, dropout, alpha, concat=True):
    super(GraphAttentionLayer, self).__init__()
    self.dropout = 0.6
    self.in_features = 64
    self.out_features = 7
    self.alpha = 0.2
    self.concat = False
    self.W = nn.Parameter(torch.empty(size=(64, 7)))
    nn.init.xavier_uniform_(self.W.data, gain=1.414)  # initialize W
    self.a = nn.Parameter(torch.empty(size=(14, 1)))  # applied to concat(V, NeigV); 2*out_features = 14
    nn.init.xavier_uniform_(self.a.data, gain=1.414)  # initialize a
    self.leakyrelu = nn.LeakyReLU(0.2)
The parameter W has shape $W_{64 \times 7}$, and the parameter a has shape $a_{14 \times 1}$.
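Both sets of shapes can be confirmed by listing the registered parameters of the model instance created above (a sketch, assuming that instance is in scope):

# Print every registered parameter and its shape.
for name, p in model.named_parameters():
    print(name, tuple(p.shape))
# attention_0.W (1433, 8), attention_0.a (16, 1), ... up to attention_7,
# then out_att.W (64, 7) and out_att.a (14, 1)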
Running the model: forward
First the model's forward method is executed:
def forward(self, x, adj):
    x = F.dropout(x, self.dropout, training=self.training)
    x = torch.cat([att(x, adj) for att in self.attentions], dim=1)  # concatenate the heads' outputs
    x = F.dropout(x, self.dropout, training=self.training)
    x = F.elu(self.out_att(x, adj))  # second (output) attention layer
    return F.log_softmax(x, dim=1)
1. F.dropout is applied, dropping the input features with dropout = 0.6.
2. self.attentions is iterated over; each att runs the forward method of GraphAttentionLayer: the call att(x, adj) passes the feature matrix x and the adjacency matrix adj into forward.
3. Inside GraphAttentionLayer's forward:
def forward(self, h, adj):
    Wh = torch.mm(h, self.W)  # h.shape: (N, in_features), Wh.shape: (N, out_features)
    a_input = self._prepare_attentional_mechanism_input(Wh)  # pair every node with every node: (N, N, 2*out_features)
    # a_input.shape = (2708, 2708, 16), self.a.shape = (16, 1);
    # torch.matmul(a_input, self.a).shape = (2708, 2708, 1); squeeze(2) drops the last dimension -> (2708, 2708)
    e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
    # e scores every node against every node; only connected nodes' coefficients are needed
    zero_vec = -9e15*torch.ones_like(e)
    attention = torch.where(adj > 0, e, zero_vec)  # entries with no edge (adj == 0) become ~ -inf
    attention = F.softmax(attention, dim=1)  # row-wise softmax: each row sums to 1
    attention = F.dropout(attention, self.dropout, training=self.training)
    h_prime = torch.matmul(attention, Wh)  # aggregate the neighbors' features
    if self.concat:
        return F.elu(h_prime)  # ELU activation
    else:
        return h_prime
4. The feature matrix h is multiplied by the first attention head's weight matrix W, giving $w_h = x_{2708 \times 1433} \times W_{1433 \times 8}$; the result $w_h$ has shape $2708 \times 8$. Then self._prepare_attentional_mechanism_input(Wh) is executed.
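In toy form, step 4 is a single matrix product (tensors fabricated to the shapes above):

import torch

h = torch.rand(2708, 1433)
W = torch.rand(1433, 8)
Wh = torch.mm(h, W)   # (N, in_features) @ (in_features, out_features)
print(Wh.shape)       # torch.Size([2708, 8])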
5. Executing self._prepare_attentional_mechanism_input(Wh):
def _prepare_attentional_mechanism_input(self, Wh):
    N = Wh.size()[0]  # number of nodes
    Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)  # repeat each row N times; shape: [2708*2708, 8]
    Wh_repeated_alternating = Wh.repeat(N, 1)               # tile the whole matrix N times
    all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)
    return all_combinations_matrix.view(N, N, 2 * self.out_features)
Suppose Wh[0] holds the following:
Wh[0]
tensor([-0.0118, -0.0033, -0.0051, 0.0151, -0.0151, 0.0186, -0.0097, 0.0387],
       grad_fn=<SelectBackward>)
Then after Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0), Wh_repeated_in_chunks has shape $2708 * 2708 \times 8$: rows Wh_repeated_in_chunks[0] through Wh_repeated_in_chunks[2707] all equal Wh[0], rows Wh_repeated_in_chunks[2708] through Wh_repeated_in_chunks[2707+2708] all equal Wh[1], and so on:
# e1, e1, ..., e1, e2, e2, ..., e2, ..., eN, eN, ..., eN
# '-------------' -> N times  '-------------' -> N times  '-------------' -> N times
After Wh_repeated_alternating = Wh.repeat(N, 1), Wh_repeated_alternating has shape $2708 * 2708 \times 8$: rows Wh_repeated_alternating[0] through Wh_repeated_alternating[2707] equal Wh[0] through Wh[2707], rows Wh_repeated_alternating[2708] through Wh_repeated_alternating[2707+2708] equal Wh[0] through Wh[2707] again, and so on:
# These are the rows of the second matrix (Wh_repeated_alternating):
# e1, e2, ..., eN, e1, e2, ..., eN, ..., e1, e2, ..., eN
# '----------------------------------------------------' -> N times
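The two row orders are easiest to see on a tiny fabricated example with N = 3 and out_features = 2:

import torch

Wh = torch.tensor([[1., 1.], [2., 2.], [3., 3.]])  # rows e1, e2, e3
print(Wh.repeat_interleave(3, dim=0))  # e1,e1,e1, e2,e2,e2, e3,e3,e3  (chunks)
print(Wh.repeat(3, 1))                 # e1,e2,e3, e1,e2,e3, e1,e2,e3  (alternating)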
all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1) concatenates Wh_repeated_in_chunks with Wh_repeated_alternating along the feature dimension, giving shape $[2708 * 2708, 16]$, in the following form:
# e1 || e1
# e1 || e2
# e1 || e3
# ...
# e1 || eN
# e2 || e1
# e2 || e2
# e2 || e3
# ...
# e2 || eN
# ...
# eN || e1
# eN || e2
# eN || e3
# ...
# eN || eN
The returned a_input is reshaped to $[2708, 2708, 16]$ and is laid out as follows: a_input[0] holds node 0's features concatenated with every node's features, a_input[1] holds node 1's features concatenated with every node's features, and so on.
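Continuing the N = 3 toy example from above, the concatenation and view produce exactly this indexing:

# a_input[i][j] is e_(i+1) || e_(j+1)
pairs = torch.cat([Wh.repeat_interleave(3, dim=0), Wh.repeat(3, 1)], dim=1)
a_input = pairs.view(3, 3, 4)   # (N, N, 2*out_features)
print(a_input[0, 1])            # tensor([1., 1., 2., 2.]) -> e1 || e2
print(a_input[2, 0])            # tensor([3., 3., 1., 1.]) -> e3 || e1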
6. e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2)) computes the attention scores: a_input has shape $[2708, 2708, 16]$ and self.a has shape $[16, 1]$, so their product has shape $[2708, 2708, 1]$, one score for every node paired with every other node; squeeze(2) removes the last dimension, leaving shape $[2708, 2708]$.
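A shape check on the toy tensors (the vector a below is random and merely stands in for self.a):

a = torch.rand(4, 1)                       # plays the role of self.a; 2*out_features = 4
e = torch.matmul(a_input, a).squeeze(2)    # (3, 3, 4) @ (4, 1) -> (3, 3, 1) -> (3, 3)
print(e.shape)                             # torch.Size([3, 3])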
7. zero_vec = -9e15*torch.ones_like(e): the scores computed so far cover every node against every node, but only the coefficients between connected nodes are needed, so a matrix zero_vec with the same shape as e is created, filled with a very large negative value.
8. attention = torch.where(adj > 0, e, zero_vec) keeps e wherever an edge exists and replaces entries with no edge (adj == 0) with the near-negative-infinity value, producing an attention matrix with the same shape as the adjacency matrix; each entry is the score between one node and another.
9. attention = F.softmax(attention, dim=1) applies a row-wise softmax, turning each row into a probability distribution over that node's neighbors; the masked entries become essentially 0.
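Steps 7-9 can be replayed on the toy tensors; the adjacency below is made up, and the point is that masked entries vanish after the softmax:

adj = torch.tensor([[1., 1., 0.],
                    [1., 1., 1.],
                    [0., 1., 1.]])
zero_vec = -9e15 * torch.ones_like(e)
attention = torch.where(adj > 0, e, zero_vec)  # scores without an edge become -9e15
attention = torch.softmax(attention, dim=1)
print(attention)              # zeros exactly where adj == 0
print(attention.sum(dim=1))   # tensor([1., 1., 1.]) -- each row sums to 1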
10. attention = F.dropout(attention, self.dropout, training=self.training) applies dropout to the attention matrix.
11. h_prime = torch.matmul(attention, Wh) multiplies the attention matrix by the transformed node features Wh, aggregating each node's neighbors into a new representation.
12. The result h_prime is passed through the ELU activation; h_prime has shape $2708 \times 8$.
13. This completes one attention head; there are 8 heads in total.
14. This yields 8 h_prime tensors; concatenating them gives x with shape [2708, 64].
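A final hedged check of the multi-head concatenation, with fabricated head outputs:

import torch

heads = [torch.rand(2708, 8) for _ in range(8)]  # eight h_prime tensors, one per head
x = torch.cat(heads, dim=1)
print(x.shape)   # torch.Size([2708, 64])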