pytorch_geometric安装及工具包使用

weixin_52568655

已于 2023-06-22 15:56:05 修改

阅读量464

点赞数

文章标签： pytorch 深度学习 python

于 2023-06-11 22:52:49 首次发布

本文链接：https://blog.csdn.net/weixin_52568655/article/details/131152782

版权

github地址：GitHub - pyg-team/pytorch_geometric: Graph Neural Network Library for PyTorch

一、安装

下载

torch-scatter: Accelerated and efficient sparse reductions
torch-sparse: SparseTensor support
torch-cluster: Graph clustering routines
torch-spline-conv: SplineConv support

下载wheels文件：data.pyg.org/whl/ （进入与本机已安装的 torch 版本和 CUDA/CPU 版本一致的目录，下载上述四个包对应的 .whl 文件）

使用 pip install <下载的 .whl 文件路径> 逐个安装上述四个包

四个依赖包安装完成后，再使用 pip install torch-geometric 安装 PyG 本体

二、使用

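1、任务1：KarateClub数据集，共34名会员，会员之间有社交关系；其中两个人有矛盾，要预测会员要站谁的队。（注意：PyG 提供的 KarateClub 数据集中，每个节点的标签是 4 个社区类别之一，所以下面的代码做的是 4 分类。）

数据集说明：整个数据集是一个图，34个会员是34个点，会员之间的关系是边；edge_index 中共有 156 条有向边（每条无向边按两个方向各存一次，即 78 条无向边）；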

目标：要对34个点进行分类

1）查询数据集

from torch_geometric.datasets import KarateClub

# Build the KarateClub dataset object (imported just above).
dataset = KarateClub()
data = dataset[0]  # Get the first graph object.

# Print the summary of the graph object (attribute names and shapes).
print(data)

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])

data.x：节点特征矩阵，形状为 [num_nodes（节点个数）, num_node_features（每个节点的特征维度）]
data.edge_index：COO 格式的图连接关系，形状为 [2, num_edges（边的个数）]，类型为 torch.long
data.edge_attr：边特征矩阵，形状为 [num_edges, num_edge_features]
data.y：要训练的目标（可以具有任意形状），例如节点级目标的形状为 [num_nodes, *]，图级目标的形状为 [1, *]
data.pos：节点位置矩阵，形状为 [num_nodes, num_dimensions]
train_mask：标记哪些节点有标签，只有这些节点参与损失计算

edge_index：表示图的连接关系（start、end 两个序列）；node features：每个点的特征；node labels：每个点的标签；train_mask：有的节点没有标签（用来表示哪些节点要计算损失）

edge_index

打印edge_index,结果如下

第一行和第二行分别是每条边的起点和终点，两行长度相同（都等于边的个数 156）

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,
          3,  3,  3,  3,  3,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,
          7,  7,  8,  8,  8,  8,  8,  9,  9, 10, 10, 10, 11, 12, 12, 13, 13, 13,
         13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21,
         21, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27,
         27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
         31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
         33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
        [ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 17, 19, 21, 31,  0,  2,
          3,  7, 13, 17, 19, 21, 30,  0,  1,  3,  7,  8,  9, 13, 27, 28, 32,  0,
          1,  2,  7, 12, 13,  0,  6, 10,  0,  6, 10, 16,  0,  4,  5, 16,  0,  1,
          2,  3,  0,  2, 30, 32, 33,  2, 33,  0,  4,  5,  0,  0,  3,  0,  1,  2,
          3, 33, 32, 33, 32, 33,  5,  6,  0,  1, 32, 33,  0,  1, 33, 32, 33,  0,
          1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33,  2, 23,
         24, 33,  2, 31, 33, 23, 26, 32, 33,  1,  8, 32, 33,  0, 24, 25, 28, 32,
         33,  2,  8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33,  8,  9, 13, 14, 15,
         18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32]])

# Print edge_index transposed, so each row is one (start, end) node pair.
edge_index = data.edge_index
print(edge_index.t())

转置之后的结果（每一行是一条边的 [起点, 终点]；这里只截取了前 13 行，完整输出共 156 行）：

tensor([[ 0,  1],
        [ 0,  2],
        [ 0,  3],
        [ 0,  4],
        [ 0,  5],
        [ 0,  6],
        [ 0,  7],
        [ 0,  8],
        [ 0, 10],
        [ 0, 11],
        [ 0, 12],
        [ 0, 13],
        [ 0, 17]]

edge_index：表示图的连接关系（start,end两个序列）
node features：每个点的特征
node labels：每个点的标签
train_mask：有的节点没有标签（用来表示哪些节点要计算损失）

2）利用GCN进行分类

import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv

#定义类GCN
#点和边没有变化，变化的是点的特征维度
class GCN(torch.nn.Module):
    """Three stacked GCNConv layers followed by a linear classifier.

    Layer widths are ``dataset.num_features -> 4 -> 4 -> 2``.  The same
    ``edge_index`` is passed to every layer; only the per-node feature size
    changes.  ``dataset`` is read from module scope when the model is built.
    """

    def __init__(self):
        super().__init__()
        # Seed before the layers are created.
        torch.manual_seed(1234)
        # Each GCNConv only needs its input and output feature sizes.
        self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        # Maps the 2-dimensional node embedding to one score per class.
        self.classifier = Linear(2, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return ``(out, h)``: the classifier output and the 2-d embedding.

        Args:
            x: Node feature input passed to the first GCNConv layer.
            edge_index: Graph connectivity, passed unchanged to every layer.
        """
        h = x
        # Every convolution is followed by a tanh activation.
        for conv in (self.conv1, self.conv2, self.conv3):
            h = conv(h, edge_index).tanh()

        # Classification layer on top of the final embedding.
        return self.classifier(h), h

# Instantiate the network and print its layer summary.
model = GCN()
print(model)

训练模型

import time

# Fresh model instance for training, together with its loss and optimizer.
model = GCN()
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Define optimizer.

def train(data):
    """Run one optimisation step on ``data`` and return ``(loss, h)``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        data: Graph object providing ``x``, ``edge_index``, ``y`` and
            ``train_mask``.

    Returns:
        The loss of this step and the second value returned by the model
        (the 2-dimensional embedding, kept so it can be plotted).
    """
    optimizer.zero_grad()
    logits, embedding = model(data.x, data.edge_index)
    # Semi-supervised: only nodes selected by train_mask contribute to the
    # loss; the remaining nodes do not affect it.
    labelled = data.train_mask
    loss = criterion(logits[labelled], data.y[labelled])
    # Back-propagate, then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()
    return loss, embedding

# Train for 401 epochs and plot the embedding on every 10th epoch.
# NOTE(review): visualize_embedding is not defined in the visible source;
# confirm it is provided elsewhere before running this loop.
for epoch in range(401):
    loss, h = train(data)
    if epoch % 10:
        continue
    # Plot the current embedding, coloured by label.
    visualize_embedding(h, color=data.y, epoch=epoch, loss=loss)
    time.sleep(0.3)

2、任务2：数据集中有多个图（MUTAG 分子数据集，每个分子是一个图），目标是对每个图进行二分类

import torch
from torch_geometric.datasets import TUDataset #分子数据集：https://chrsmrrs.github.io/datasets/

# Load the MUTAG dataset through TUDataset, using 'data/TUDataset' as root.
dataset = TUDataset(root='data/TUDataset', name='MUTAG')

# Dataset-level summary: number of graphs, features and classes.
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

# Print the summary of that first graph.
print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

from torch_geometric.loader import DataLoader

# `train_dataset` was never defined before being passed to DataLoader, which
# raised a NameError.  Build it here: shuffle a copy of the dataset with a
# fixed seed and keep the first 150 graphs for training (the split used by
# the official PyG graph-classification tutorial).  `dataset` itself is left
# untouched.
torch.manual_seed(12345)
train_dataset = dataset.shuffle()[:150]

# Mini-batches of up to 64 graphs, reshuffled on every pass.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Inspect what each mini-batch looks like.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Graph-level classifier: three GCNConv layers, mean pooling, linear head.

    The input width is ``dataset.num_node_features`` and the output width is
    ``dataset.num_classes``; ``dataset`` is read from module scope when the
    model is built.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Seed before the layers are created.
        torch.manual_seed(12345)
        in_channels = dataset.num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class scores per graph in the batch."""
        # 1. Encode every node; ReLU follows the first two layers only.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # 2. Average the node embeddings of each graph.
        pooled = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # 3. Dropout (controlled by self.training), then the output layer.
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

# Build the network with 64 hidden channels and print its layer summary.
model = GCN(hidden_channels=64)
print(model)

# Re-create the model for training, with an Adam optimizer and a
# cross-entropy loss.
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one pass over ``train_loader``, updating the model after each batch.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``train_loader``.  Prints the shape of the model output for every batch.
    """
    model.train()

    for batch in train_loader:
        # Single forward pass for the whole mini-batch.
        logits = model(batch.x, batch.edge_index, batch.batch)
        print(logits.shape)
        # Compute the loss and derive gradients.
        criterion(logits, batch.y).backward()
        # Update the parameters, then clear gradients for the next batch.
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    """Return the fraction of graphs in ``loader.dataset`` predicted correctly.

    Uses the module-level ``model``, which is switched to eval mode.

    Args:
        loader: Iterable of batches providing ``x``, ``edge_index``,
            ``batch`` and ``y``; must also expose ``dataset`` with a length.

    Returns:
        Number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    # Evaluation never back-propagates, so disable gradient tracking to avoid
    # building autograd graphs for every batch.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            out = model(data.x, data.edge_index, data.batch)
            # Use the class with the highest score as the prediction.
            pred = out.argmax(dim=1)
            # Count predictions that match the ground-truth labels.
            correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


# Train for 170 epochs; after each epoch report accuracy on the training
# loader.
for epoch in range(1, 171):
    train()
    train_acc = test(train_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')

总结一下：

1、对点进行分类：如果只有部分节点有标注信息，loss 仅需要在有标注的节点（train_mask 为 True 的节点）上计算，即半监督学习。训练和预测分类时使用的是同一个模型，前向计算始终在整张图的所有节点上进行。

2、对图进行分类，其实还是对各个节点进行特征编码，只不过现在多了一步聚合操作，把各个节点特征汇总成全局特征就相当于得到了整个图的编码：torch_geometric.nn.global_mean_pool