本文主要参考DataWhale图神经网络组队学习
三个数据集的统计信息如下:
数据集 | Cora | CiteSeer | PubMed |
---|---|---|---|
节点数 | 2708 | 3327 | 19717 |
边数 | 5278 | 4552 | 44324 |
训练节点数 | 140 | 120 | 60 |
验证节点数 | 500 | 500 | 500 |
测试节点数 | 1000 | 1000 | 1000 |
节点类别数 | 7 | 6 | 3 |
特征维度 | 1433 | 3703 | 500 |
边密度 | 0.0014 | 0.0008 | 0.0002 |
边密度计算公式:$p = \dfrac{2m}{n^{2}}$,其中 $m$ 表示边数,$n$ 表示节点数。
本文仅采用Cora数据集进行展示。
实践问题一:尝试使用PyG中的不同的网络层去代替GCNConv,以及不同的层数和不同的out_channels ,来实现节点分类任务。
加载数据集
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
# Load the Cora citation graph; NormalizeFeatures row-normalizes each node's
# bag-of-words feature vector so its entries sum to 1.
dataset = Planetoid(root='dataset/Cora', name='Cora',
transform=NormalizeFeatures())
搭建三层GAT [256, 128, 64],外加一个线性层(64---->7)
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, Sequential
from torch.nn import Linear
from torch.nn import ReLU
class GAT(torch.nn.Module):
    """Stack of GATConv layers (built with PyG's Sequential) + a linear head.

    Args:
        num_features: dimensionality of the input node features.
        hidden_channels_list: output width of each GAT layer, in order.
        num_classes: number of target classes.
    """

    def __init__(self, num_features, hidden_channels_list, num_classes):
        super(GAT, self).__init__()
        widths = [num_features, *hidden_channels_list]
        layers = []
        # Pair consecutive widths to get (in_dim, out_dim) for each conv.
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            layers.append((GATConv(in_dim, out_dim), 'x, edge_index -> x'))
            layers.append(ReLU(inplace=True))
        self.convseq = Sequential('x, edge_index', layers)
        self.linear = Linear(hidden_channels_list[-1], num_classes)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes."""
        h = self.convseq(x, edge_index)
        h = F.dropout(h, p=0.5, training=self.training)
        return F.log_softmax(self.linear(h), dim=1)
初始化模型并利用GPU加速
# NOTE: is_available must be CALLED — the bare function object is always
# truthy, so the original expression chose 'cuda' even on CPU-only machines
# and crashed at the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)
hidden_channels_list = [256, 128, 64]
model = GAT(dataset.num_features, hidden_channels_list, dataset.num_classes)
model = model.to(device)
# weight_decay adds L2 regularization — standard for Planetoid benchmarks.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
模型训练
def train():
    """Run one optimization step on the training nodes; return the loss."""
    model.train()
    optimizer.zero_grad()
    log_probs = model(data.x, data.edge_index)
    # Semi-supervised: the loss only sees nodes inside the training mask.
    loss = F.nll_loss(log_probs[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss


for epoch in range(1, 201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
模型测试
@torch.no_grad()  # pure inference — skip autograd bookkeeping
def test():
    """Return test-set accuracy as a plain Python float."""
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    # Convert BOTH counts to int so the division yields a Python float;
    # the original divided by a tensor and returned a 0-dim tensor.
    test_acc = int(test_correct) / int(data.test_mask.sum())
    return test_acc


test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
可视化
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def visualize(h, color):
    """Project node embeddings to 2-D with t-SNE and scatter-plot them.

    Args:
        h: (num_nodes, dim) embedding tensor (may live on GPU / in the graph).
        color: per-node integer labels used to color the points.
    """
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    plt.scatter(z[:, 0], z[:, 1], s=70, c=color.detach().cpu().numpy(),
                cmap='Paired')
    plt.show()


# Embeddings are for plotting only — compute them without gradient tracking
# (the original built a full autograd graph here for nothing).
with torch.no_grad():
    out = model(data.x, data.edge_index)
visualize(out[data.test_mask], data.y[data.test_mask])
搭建四层GAT,[256, 128, 64, 32],外加一个线性层(32----->7) (方法同上)
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, Sequential
from torch.nn import Linear
from torch.nn import ReLU
class GAT(torch.nn.Module):
    """GATConv stack assembled with PyG Sequential, plus a linear classifier."""

    def __init__(self, num_features, hidden_channels_list, num_classes):
        super(GAT, self).__init__()
        dims = [num_features] + hidden_channels_list
        modules = []
        for i in range(1, len(dims)):
            # The signature string tells Sequential how to route the inputs.
            modules.append((GATConv(dims[i - 1], dims[i]), 'x, edge_index -> x'))
            modules.append(ReLU(inplace=True))
        self.convseq = Sequential('x, edge_index', modules)
        self.linear = Linear(hidden_channels_list[-1], num_classes)

    def forward(self, x, edge_index):
        """Compute log-probabilities for every node."""
        out = self.convseq(x, edge_index)
        out = F.dropout(out, p=0.5, training=self.training)
        out = self.linear(out)
        return F.log_softmax(out, dim=1)
# NOTE: is_available must be CALLED — the bare function object is always
# truthy, so the original always picked 'cuda', even without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)
hidden_channels_list = [256, 128, 64, 32]
model = GAT(dataset.num_features, hidden_channels_list, dataset.num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
    """Single optimization step for the 4-layer GAT; returns the loss."""
    model.train()
    optimizer.zero_grad()
    predictions = model(data.x, data.edge_index)
    # Only training-mask nodes contribute to the loss.
    loss = F.nll_loss(predictions[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss


for epoch in range(1, 201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
@torch.no_grad()  # evaluation only — no gradients needed
def test():
    """Return test accuracy of the 4-layer GAT as a plain Python float."""
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    # int() on both operands yields a float, not a 0-dim tensor as before.
    test_acc = int(test_correct) / int(data.test_mask.sum())
    return test_acc


test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
可视化
# Plot the 4-layer model's test-node embeddings (reuses visualize() defined
# earlier in the file).
out = model(data.x, data.edge_index)
visualize(out[data.test_mask], data.y[data.test_mask])
实践问题二:在边预测任务中,尝试用torch_geometric.nn.Sequential容器构造图神经网络。
利用Sequential容器和GCN来构造神经网络
import torch
from torch_geometric.nn import GCNConv, Sequential
import torch.nn.functional as F
from torch.nn import Linear
from torch.nn import ReLU
class Net(torch.nn.Module):
    """GCN encoder for link prediction, assembled with PyG's Sequential."""

    def __init__(self, num_features, hidden_channels_list):
        super(Net, self).__init__()
        channel_sizes = [num_features] + hidden_channels_list
        blocks = []
        for src, dst in zip(channel_sizes, channel_sizes[1:]):
            blocks.append((GCNConv(src, dst), 'x, edge_index -> x'))
            blocks.append(ReLU(inplace=True))
        self.convseq = Sequential('x, edge_index', blocks)

    def encode(self, x, edge_index):
        """Map node features to embeddings using the (training) graph."""
        x = F.dropout(x, p=0.6, training=self.training)
        return self.convseq(x, edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        """Score each candidate edge by the dot product of its endpoints."""
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        """Return an edge_index of every node pair with a positive score."""
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()
def get_link_labels(pos_edge_index, neg_edge_index, device=None):
    """Build binary labels for the concatenated (positive, negative) edges.

    Args:
        pos_edge_index: (2, P) positive edges — labeled 1.
        neg_edge_index: (2, N) negative edges — labeled 0.
        device: optional target device for the label tensor. Defaults to
            None (CPU), matching the original behavior; passing a device
            saves the caller a separate ``.to(...)`` transfer.

    Returns:
        Float tensor of length P + N: ones first, then zeros.
    """
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(num_links, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels
def train(data, model, optimizer):
    """One training step of the link predictor; returns the BCE loss.

    NOTE: the original ended with pasted markdown-template junk
    (``// An highlighted block`` / ``var foo = 'bar';``) that is not valid
    Python — removed here.
    """
    # Local import: this script section never imports it at top level, so
    # the call below would otherwise raise NameError.
    from torch_geometric.utils import negative_sampling

    model.train()
    # Fresh negatives every epoch, sampled against the training positives.
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index,
                                  neg_edge_index).to(data.x.device)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()
    return loss
from sklearn.metrics import roc_auc_score


@torch.no_grad()
def test(data, model):
    """Evaluate ROC-AUC on the validation and test edge splits.

    Returns:
        [val_auc, test_auc]
    """
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)
    aucs = []
    for split in ['val', 'test']:
        pos_edge_index = data[f'{split}_pos_edge_index']
        neg_edge_index = data[f'{split}_neg_edge_index']
        logits = model.decode(z, pos_edge_index, neg_edge_index)
        probs = logits.sigmoid()
        labels = get_link_labels(pos_edge_index, neg_edge_index)
        aucs.append(roc_auc_score(labels.cpu(), probs.cpu()))
    return aucs
def main():
    """End-to-end link-prediction training on Cora with the GCN encoder."""
    # Local imports: this script section never imports these names at top
    # level, so main() would otherwise fail with NameError.
    from torch_geometric.datasets import Planetoid
    from torch_geometric.transforms import NormalizeFeatures
    from torch_geometric.utils import train_test_split_edges

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = Planetoid(root='dataset/Cora', name='Cora',
                        transform=NormalizeFeatures())
    data = dataset[0]
    ground_truth_edge_index = data.edge_index.to(device)
    # Drop node-classification attributes before splitting edges.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)
    model = Net(dataset.num_features, [128, 64]).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        # Report the test AUC achieved at the best-validation epoch.
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
              f'Test: {test_auc:.4f}')
    z = model.encode(data.x, data.train_pos_edge_index)
    final_edge_index = model.decode_all(z)


if __name__ == "__main__":
    main()
利用Sequential容器和GAT来构造神经网络
import torch
from torch_geometric.nn import GATConv, Sequential
import torch.nn.functional as F
from torch.nn import Linear
from torch.nn import ReLU
class Net(torch.nn.Module):
    """GAT encoder for link prediction (Sequential stack of GATConv + ReLU)."""

    def __init__(self, num_features, hidden_channels_list):
        super(Net, self).__init__()
        layer_dims = [num_features] + hidden_channels_list
        stack = []
        for i in range(len(hidden_channels_list)):
            stack.append((GATConv(layer_dims[i], layer_dims[i + 1]),
                          'x, edge_index -> x'))
            stack.append(ReLU(inplace=True))
        self.convseq = Sequential('x, edge_index', stack)

    def encode(self, x, edge_index):
        """Embed nodes: input dropout, then the GAT stack."""
        dropped = F.dropout(x, p=0.6, training=self.training)
        return self.convseq(dropped, edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        """Dot-product score for each positive and negative candidate edge."""
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        """Edge_index of every node pair whose dot-product score is > 0."""
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()
def get_link_labels(pos_edge_index, neg_edge_index, device=None):
    """Build binary labels for the concatenated (positive, negative) edges.

    Args:
        pos_edge_index: (2, P) positive edges — labeled 1.
        neg_edge_index: (2, N) negative edges — labeled 0.
        device: optional target device for the label tensor. Defaults to
            None (CPU), matching the original behavior.

    Returns:
        Float tensor of length P + N: ones first, then zeros.
    """
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(num_links, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels
def train(data, model, optimizer):
    """One training step of the GAT link predictor; returns the BCE loss."""
    # Local import: negative_sampling is never imported at top level in this
    # script section, so the call below would otherwise raise NameError.
    from torch_geometric.utils import negative_sampling

    model.train()
    # Resample negatives from the training positives each epoch.
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index,
                                  neg_edge_index).to(data.x.device)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()
    return loss
from sklearn.metrics import roc_auc_score


@torch.no_grad()
def test(data, model):
    """ROC-AUC on the 'val' and 'test' edge splits; returns [val, test]."""
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)
    scores = []
    for name in ['val', 'test']:
        pos_edge_index = data[f'{name}_pos_edge_index']
        neg_edge_index = data[f'{name}_neg_edge_index']
        logits = model.decode(z, pos_edge_index, neg_edge_index)
        probabilities = logits.sigmoid()
        labels = get_link_labels(pos_edge_index, neg_edge_index)
        scores.append(roc_auc_score(labels.cpu(), probabilities.cpu()))
    return scores
def main():
    """End-to-end link-prediction training on Cora with the GAT encoder."""
    # Local imports: these names are never imported at top level in this
    # script section, so main() would otherwise fail with NameError.
    from torch_geometric.datasets import Planetoid
    from torch_geometric.transforms import NormalizeFeatures
    from torch_geometric.utils import train_test_split_edges

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = Planetoid(root='dataset/Cora', name='Cora',
                        transform=NormalizeFeatures())
    data = dataset[0]
    ground_truth_edge_index = data.edge_index.to(device)
    # Drop node-classification attributes before splitting edges.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)
    model = Net(dataset.num_features, [128, 64]).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        # Track the test AUC achieved at the best-validation epoch.
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
              f'Test: {test_auc:.4f}')
    z = model.encode(data.x, data.train_pos_edge_index)
    final_edge_index = model.decode_all(z)


if __name__ == "__main__":
    main()
思考问题三:如下方代码所示,我们以data.train_pos_edge_index为实际参数来进行训练集负样本采样,但这样采样得到的负样本可能包含一些验证集的正样本与测试集的正样本,即可能将真实的正样本标记为负样本,由此会产生冲突。但我们还是这么做,这是为什么?
neg_edge_index = negative_sampling( edge_index=data.train_pos_edge_index, num_nodes=data.num_nodes, num_neg_samples=data.train_pos_edge_index.size(1))
就Cora数据集举例,共有2708个节点,假设每个节点都与其他所有节点相连(包括本身),那么共存在2708*2708=7333264条边,在train_pos_edge_index中共存在8976条边,所以负样本是从其余的七百多万条边中去采样,因此即使采样到的负样本是验证集或测试集中的正样本,那么这种巧合的事情发生概率非常低,不会对模型的整体泛化能力有特别大的影响。