1.由于传统神经网络(CNN、RNN)要求输入数据的维度是固定的,无法处理结构不固定、大小不一的输入,例如分子的结构
2.gnn可以得到每一个点通过消息传递机制,经过训练之后,可以把每一个点的特征变得更好,可用于后续的分类或者回归
3.gnn训练的数据类型(是自己定义的)
from torch_geometric.datasets import KarateClub
data=KarateClub()
print(len(data)) # 调用__len__方法,返回图的个数
print(data.num_features) # 每一个点的特征数
print(data.num_classes) # 一共有多少个类别
data=data[0]
# Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])
print(data.x) # x是初始时每个点的特征
print(data.y) # 每一个点的类别
print(data.edge_index) # 2*N的表示有边相连的
"""
[0,1,2,3,4]
[2,1,0,4,3] 表示0号点到2号点有边,1号点到1号点右边。。。。。。
"""
print(data.train_mask) # 标注True的表示打好标签的(只用它们去训练,得到loss)
点分类
from torch_geometric.datasets import KarateClub
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
class GCN(nn.Module):
    """Three stacked GCN layers plus a linear head for node classification."""

    def __init__(self):
        super(GCN, self).__init__()
        # Progressively shrink the node embeddings: num_features -> 4 -> 4 -> 2.
        self.conv1 = GCNConv(data.num_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        # Map each node's 2-dim embedding onto 4 class logits.
        self.classifier = nn.Linear(2, 4)

    def forward(self, x, edge_index):
        """x: current per-node feature vectors; edge_index: graph connectivity."""
        h = self.conv1(x, edge_index).tanh()
        h = self.conv2(h, edge_index).tanh()
        h = self.conv3(h, edge_index).tanh()
        return self.classifier(h)
model = GCN()
criterion = nn.CrossEntropyLoss()  # expects raw (unnormalized) logits
optimizer = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Only the masked (labeled) nodes contribute to the loss.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    print(loss.item())
4.构建自己的数据集
import torch
from torch_geometric.data import Data
# Four nodes, each with a 2-dim initial feature vector.
# BUG FIX: features must be floating point — GCNConv/GATConv layers reject
# integer inputs, so build x with dtype=torch.float.
x = torch.tensor([[2, 1], [5, 6], [3, 7], [12, 0]], dtype=torch.float)
y = torch.tensor([0, 1, 0, 1])  # class label of each node
edge_index = torch.tensor([[0, 1, 2, 0, 3],   # source nodes
                           [1, 0, 1, 3, 2]],  # target nodes (edge order does not matter)
                          dtype=torch.long)
data = Data(x=x, y=y, edge_index=edge_index)
print(data)
# Data(x=[4, 2], edge_index=[2, 5], y=[4])
5.可视化
(1)通过维度进行绘制
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize(h, color):
    """Project the node embeddings h to 2-D with t-SNE and scatter-plot them."""
    # Dimensionality reduction: collapse the feature vectors to (x, y) coords.
    coords = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    plt.scatter(coords[:, 0], coords[:, 1], s=70, c=color, cmap="Set2")
    plt.show()
# Visualize the model's output embeddings, colored by the true node labels.
model = GCN()
out = model(data.x, data.edge_index)
visualize(out, color=data.y)
(2)通过图层次绘制
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx
def visualize_graph(G, color):
    """Draw the networkx graph G with nodes colored by *color*."""
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])
    layout = nx.spring_layout(G, seed=42)  # fixed seed -> reproducible layout
    nx.draw_networkx(G, pos=layout, with_labels=False,
                     node_color=color, cmap="Set2")
    plt.show()
G = to_networkx(data, to_undirected=True) # data is one graph; to_undirected draws the edges as undirected
visualize_graph(G, color=data.y)
6.图分类
加载数据集
import torch
from torch_geometric.datasets import TUDataset # molecule datasets: https://chrsmrrs.github.io/datasets/
dataset = TUDataset(root='data/TUDataset', name='MUTAG')
print(f'Dataset: {dataset}:') # Dataset: MUTAG(188):
print(f'Number of graphs: {len(dataset)}') # 188 graphs in total
print(f'Number of features: {dataset.num_features}') # 7 features per node
print(f'Number of classes: {dataset.num_classes}') # 2 classes (graph-level labels)
data = dataset[0] # take the first graph
print(data) # Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
print(f'Number of nodes: {data.num_nodes}') # 17
print(f'Number of edges: {data.num_edges}') # 38
构造DataLoader
from torch_geometric.loader import DataLoader
# Wrap the dataset in a DataLoader; each mini-batch holds 64 graphs.
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)
构造GCN
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
class GCN(torch.nn.Module):
    """GCN encoder + mean pooling + linear head for graph-level classification."""

    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)  # reproducible weight initialization
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Encode every node with three GCN layers.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)
        # 2. Average the node embeddings of each graph -> [num_graphs, hidden_channels].
        h = global_mean_pool(h, batch)
        # 3. Classification head with dropout (active only in training mode).
        h = F.dropout(h, p=0.5, training=self.training)
        return self.lin(h)
训练和验证
# Instantiate the graph-classification model, its optimizer and its loss.
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()  # expects raw logits
def train():
    """Run one training epoch over every mini-batch of graphs."""
    model.train()
    for batch_data in train_loader:
        logits = model(batch_data.x, batch_data.edge_index, batch_data.batch)
        loss = criterion(logits, batch_data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next batch
def test(loader):
    """Return the graph-classification accuracy over all graphs in *loader*."""
    model.eval()
    n_correct = 0
    for batch_data in loader:
        logits = model(batch_data.x, batch_data.edge_index, batch_data.batch)
        predictions = logits.argmax(dim=1)  # class with the highest logit
        n_correct += int((predictions == batch_data.y).sum())
    return n_correct / len(loader.dataset)
for epoch in range(1, 171):
    train()
    # NOTE: the "test" set is the training loader here, so this reports training accuracy.
    train_acc = test(train_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')
7.GAT
GAT是在GNN中引入注意力机制(attention):通过再添加一组可学习的参数,使每个点在聚合相邻点特征来更新自身特征时,能够对不同邻居赋予不同的权重。
from torch_geometric.datasets import KarateClub
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GATConv
# BUG FIX: KarateClub() is a dataset object; index it to get the single graph,
# whose .x / .y / .train_mask attributes the training code below relies on.
data = KarateClub()[0]
class GAT(nn.Module):
    """Three GAT (graph attention) layers plus a linear head for node classification."""

    def __init__(self):
        # BUG FIX: the original called super(GCN, self).__init__(), which raises
        # TypeError because GAT is not a subclass of GCN — it must reference GAT.
        super(GAT, self).__init__()
        # Progressively shrink the node embeddings: num_features -> 4 -> 4 -> 2.
        self.conv1 = GATConv(data.num_features, 4)
        self.conv2 = GATConv(4, 4)
        self.conv3 = GATConv(4, 2)
        # Map each node's 2-dim embedding onto 4 class logits.
        self.classifier = nn.Linear(2, 4)

    def forward(self, x, edge_index):
        """x: current per-node feature vectors; edge_index: graph connectivity."""
        x = self.conv1(x, edge_index)
        x = x.tanh()
        x = self.conv2(x, edge_index)
        x = x.tanh()
        x = self.conv3(x, edge_index)
        x = x.tanh()
        out = self.classifier(x)
        return out
# BUG FIX: the original instantiated GCN() here; this section trains the GAT.
model = GAT()
criterion = nn.CrossEntropyLoss()  # expects raw logits
optimizer = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Only the labeled nodes (train_mask) contribute to the loss.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    print(loss.item())