GitHub 地址:https://github.com/pyg-team/pytorch_geometric (pyg-team/pytorch_geometric: Graph Neural Network Library for PyTorch)
一、安装
下载
- torch-scatter: Accelerated and efficient sparse reductions
- torch-sparse: SparseTensor support
- torch-cluster: Graph clustering routines
- torch-spline-conv: SplineConv support
从 https://data.pyg.org/whl/ 下载与本机 PyTorch 版本和 CUDA 版本相匹配的上述四个库的 wheel 文件
使用 pip install <wheel 文件路径> 依次安装下载好的 wheel 文件
四个依赖安装完成后,再使用 pip install torch-geometric 安装 PyG 本体
二、使用
1、任务1:KarateClub数据集,共34名会员,会员之间有社交关系;其中两个人有矛盾,要预测会员要站谁的队。
数据集说明:整个数据集是一个图,34个会员是34个节点,会员之间的关系是边;edge_index 中共有156条边(无向图的每条边按两个方向各存储一次,即78条无向边);
目标:要对34个节点进行分类(下文代码中该数据集的标签共 4 类)
1)查询数据集
from torch_geometric.datasets import KarateClub
# Load the karate club dataset; it contains a single graph.
dataset = KarateClub()
data = dataset[0] # Get the first graph object.
# Print a summary of the graph's attributes and their shapes.
print(data)
Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])
-
data.x:节点特征矩阵,形状为 [num_nodes, num_node_features](num_nodes 为节点/样本个数,num_node_features 为每个节点的特征维度)
-
data.edge_index:COO 格式的图连接关系,形状为 [2, num_edges](num_edges 为边的个数),数据类型为 torch.long
-
data.edge_attr:边特征矩阵,形状为 [num_edges, num_edge_features]
-
data.y:训练目标(可以是任意形状),例如节点级目标的形状为 [num_nodes, *],图级目标的形状为 [1, *]
-
data.pos:节点位置矩阵,形状为 [num_nodes, num_dimensions]
-
train_mask:布尔掩码,标记哪些节点有标签;只有这些节点参与损失计算
edge_index:表示图的连接关系(start,end两个序列) - node features:每个点的特征 - node labels:每个点的标签 - train_mask:有的节点木有标签(用来表示哪些节点要计算损失)
edge_index
打印edge_index,结果如下
第一行是每条边的起点,第二行是对应的终点,两行长度相同(都等于边的个数)
tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 11, 12, 12, 13, 13, 13, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27, 27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33], [ 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32]])
# Transpose the [2, num_edges] index so that each printed row is one (start, end) pair.
edge_index = data.edge_index
print(edge_index.t())
转置之后的结果(每一行是一条边的 [起点, 终点],此处只列出前 13 行):
tensor([[ 0, 1], [ 0, 2], [ 0, 3], [ 0, 4], [ 0, 5], [ 0, 6], [ 0, 7], [ 0, 8], [ 0, 10], [ 0, 11], [ 0, 12], [ 0, 13], [ 0, 17], ...])
- edge_index:表示图的连接关系(start,end两个序列)
- node features:每个点的特征
- node labels:每个点的标签
- train_mask:有的节点木有标签(用来表示哪些节点要计算损失)
2)利用GCN进行分类
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv
# GCN model for node classification.
# The nodes and edges of the graph never change; only the per-node feature
# dimension changes from layer to layer.
class GCN(torch.nn.Module):
    """Three stacked GCNConv layers followed by a linear classifier.

    Per-node feature width: ``dataset.num_features`` -> 4 -> 4 -> 2. The final
    2-dimensional embedding is returned alongside the class scores so that it
    can be plotted.
    """

    def __init__(self):
        super().__init__()
        # Seed before creating the layers so their initial weights are reproducible.
        torch.manual_seed(1234)
        # A GCNConv layer only needs its input and output feature sizes.
        self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        # Maps each node's 2-dimensional embedding to one score per class.
        self.classifier = Linear(2, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return ``(out, h)``: class scores and the 2-dimensional node embedding.

        ``x`` is the node feature matrix and ``edge_index`` the graph
        connectivity in the two-row (start, end) format; the connectivity is
        passed unchanged to every layer.
        """
        h = x
        # Each convolution is followed by a tanh activation.
        for conv in (self.conv1, self.conv2, self.conv3):
            h = torch.tanh(conv(h, edge_index))
        # Classification layer on top of the final embedding.
        return self.classifier(h), h
# Instantiate the network and print its layer structure.
model = GCN()
print(model)
训练模型
import time
# Create a fresh model together with the loss and optimizer used by train().
model = GCN()
criterion = torch.nn.CrossEntropyLoss() # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Define optimizer.
def train(data):
    """Run one optimisation step on the whole graph.

    Returns ``(loss, h)`` where ``h`` is the 2-dimensional node embedding
    produced by the model; it is returned mainly so it can be plotted.
    """
    optimizer.zero_grad()
    out, h = model(data.x, data.edge_index)
    # Semi-supervised: only nodes whose train_mask is True contribute to the
    # loss, so nodes without labels do not affect it.
    labelled = data.train_mask
    loss = criterion(out[labelled], data.y[labelled])
    # Back-propagate, then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()
    return loss, h
# NOTE(review): visualize_embedding is not defined in the visible part of this
# file; a plotting helper with this signature must be provided before running.
for epoch in range(401):
    loss, h = train(data)
    if epoch % 10 != 0:
        continue
    # Every 10th epoch: draw the current embedding, then pause briefly.
    visualize_embedding(h, color=data.y, epoch=epoch, loss=loss)
    time.sleep(0.3)
任务2:有多个图,针对图进行2分类
import torch
from torch_geometric.datasets import TUDataset #分子数据集:https://chrsmrrs.github.io/datasets/
# Load the MUTAG dataset from the TU collection (stored under data/TUDataset)
# and print dataset-level statistics.
dataset = TUDataset(root='data/TUDataset', name='MUTAG')
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
data = dataset[0] # Get the first graph object.
print()
print(data)
print('=============================================================')
# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')
from torch_geometric.loader import DataLoader
# The original snippet used train_dataset without ever defining it, which
# raises NameError. Build a reproducible shuffled 80/20 split first; the global
# `dataset` itself is left untouched because the model definition reads it.
torch.manual_seed(12345)
shuffled_dataset = dataset.shuffle()
num_train = int(len(shuffled_dataset) * 0.8)
train_dataset = shuffled_dataset[:num_train]
test_dataset = shuffled_dataset[num_train:]

# Mini-batches of up to 64 graphs; only the training loader is reshuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Inspect the batches: each `data` here is one mini-batch of graphs.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
class GCN(torch.nn.Module):
    """GCN for graph classification.

    Node features are encoded by three GCNConv layers, averaged per graph with
    ``global_mean_pool``, and passed through dropout and a linear layer that
    outputs one score per class.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Seed before creating the layers so their initial weights are reproducible.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return the class scores for every graph in the mini-batch."""
        # 1. Encode every node; there is no activation after the last convolution.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # 2. Average the node embeddings of each graph.
        pooled = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # 3. Output layer; dropout is only active while training.
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)
# Build the model once to print its layer structure.
model = GCN(hidden_channels=64)
print(model)
# Build the model again for training, together with its optimizer and loss.
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
def train():
    """Run one training epoch over all mini-batches of ``train_loader``."""
    model.train()
    for batch in train_loader:
        # Single forward pass over the graphs in this mini-batch.
        logits = model(batch.x, batch.edge_index, batch.batch)
        print(logits.shape)
        batch_loss = criterion(logits, batch.y)
        # Derive gradients, update the parameters, then clear the gradients
        # so they do not accumulate into the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
def test(loader):
    """Return the fraction of graphs in ``loader`` that the model classifies correctly.

    The model is switched to evaluation mode and the forward passes run under
    ``torch.no_grad()``, since evaluation needs no gradients and should not
    build an autograd graph.
    """
    model.eval()
    correct = 0
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            out = model(data.x, data.edge_index, data.batch)
            # Predict the class with the highest score.
            pred = out.argmax(dim=1)
            # Count the predictions that match the ground-truth labels.
            correct += int((pred == data.y).sum())
    # Ratio of correct predictions over the whole dataset behind the loader.
    return correct / len(loader.dataset)
# Train for 170 epochs and report the accuracy on the training loader after each one.
for epoch in range(1, 171):
    train()
    train_acc = test(train_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')
总结一下:
1、对点进行分类:如果只有部分节点有标注信息,loss 只需在有标注的节点(train_mask 为 True)上计算,即半监督学习。训练和预测使用的是同一个模型、同一张完整的图。
2、对图进行分类,其实还是对各个节点进行特征编码,只不过现在多了一步聚合操作,把各个节点特征汇总成全局特征就相当于得到了整个图的编码:torch_geometric.nn.global_mean_pool