构建网络模型
方式一:自己制作数据集的类(SAGEConv_model.py)
和图像中的卷积和池化操作非常相似,最后再全连接输出
(这个是直接写的py文件,方式二里我是在jupyter notebook中写的,所以传参的方式会不太一样)
1、导包
2、参数定义:root和result_df_path,默认分别指向文件夹“data(local)”和文件“comment_df_embedding_18063.pkl”
3、数据加载:(1)首先读取result_df,使其变成dataframe格式的文件;(2)调用LocalCommentBinaryDataset将数据处理成网络可用的输入dataset = LocalCommentBinaryDataset(root=args.root, result_df=result_df);(3)分批处理
4、构建网络
5、定义训练、验证函数
6、训练
# -*- coding: utf-8 -*-
"""
@File: SAGEConv_model.py
@Time : 2024/7/2 10:42
@dec: 构建简单的图网络模型,训练Reddit数据集,看数据集的效果
"""
import torch
from torch_geometric.data import DataLoader
from local_graph_create import LocalCommentBinaryDataset
from config import (LOCAL_EMBEDDINGS_OUTPUT_PATH,LOCAL_GRAPH_OUTPUT_PATH)
import argparse
import pandas as pd
from torch_geometric.nn import TopKPooling,SAGEConv
from torch_geometric.nn import global_mean_pool as gap,global_max_pool as gmp
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
# Command-line options: where the processed graph dataset is stored and where
# the pickled embeddings DataFrame lives.
parser = argparse.ArgumentParser(description="Load local graph data for training.")
parser.add_argument('--root', type=str, default=LOCAL_GRAPH_OUTPUT_PATH,
                    help='Root directory for dataset storage.')
# The file is read with pd.read_pickle below, so the help text must say pickle,
# not CSV.
parser.add_argument('--result_df_path', type=str, default=LOCAL_EMBEDDINGS_OUTPUT_PATH,
                    help='Path to the pickled result_df DataFrame (.pkl file).')
args = parser.parse_args()

# NOTE(review): unpickling can execute arbitrary code; only point
# --result_df_path at files you produced yourself.
result_df = pd.read_pickle(args.result_df_path)

# Hand the DataFrame to the project's dataset class.
dataset = LocalCommentBinaryDataset(root=args.root, result_df=result_df)

# Mini-batches of 64 graphs, shuffled.
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
# ---------------------------------------------构建网络-----------------------------------------------------------
embed_dim = 768  # default width of the input node features


class Net(torch.nn.Module):
    """Graph-level binary classifier: three SAGEConv + TopKPooling stages and an MLP head.

    After every stage the surviving nodes are mean-pooled per graph; the three
    per-graph readouts are summed and fed through three linear layers. The
    result is one sigmoid probability per graph.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of every convolution (and of the readout).
        pool_ratio: ``ratio`` passed to each TopKPooling layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, in_channels=embed_dim, hidden_channels=128,
                 pool_ratio=0.8, dropout=0.5):
        super().__init__()
        head_channels = 64
        self.dropout = dropout

        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.pool1 = TopKPooling(hidden_channels, ratio=pool_ratio)  # down-sampling: fewer nodes
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.pool2 = TopKPooling(hidden_channels, ratio=pool_ratio)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.pool3 = TopKPooling(hidden_channels, ratio=pool_ratio)

        self.lin1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, head_channels)
        self.lin3 = torch.nn.Linear(head_channels, 1)  # one logit per graph

        # Registered but not applied in forward(); kept so the parameter list
        # and state_dict keys stay the same as before.
        self.bn1 = torch.nn.BatchNorm1d(hidden_channels)
        self.bn2 = torch.nn.BatchNorm1d(head_channels)

        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, data):
        """Return one probability in (0, 1) per graph in the batch.

        Args:
            data: Batch object providing ``x``, ``edge_index`` and ``batch``.
                ``x`` must have ``in_channels`` columns; the number of nodes
                may differ from graph to graph.

        Returns:
            1-D tensor with one sigmoid output per graph.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        # Sum of the per-stage graph readouts. Each readout has
        # hidden_channels columns, taken at a progressively coarser graph.
        readout = 0
        for conv, pool in stages:
            x = F.relu(conv(x, edge_index))
            x, edge_index, _, batch, _, _ = pool(x, edge_index, None, batch)
            # Node features -> graph feature: mean over the nodes of each graph.
            readout = readout + gap(x, batch)

        # Fully connected head.
        x = self.act1(self.lin1(readout))
        x = self.act2(self.lin2(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        return torch.sigmoid(self.lin3(x)).squeeze(1)
def train():
    """Run one training epoch over ``data_loader``.

    Relies on the module-level ``model``, ``optimizer``, ``crit`` and
    ``data_loader``.

    Returns:
        float: Sum of (batch loss * graphs in batch) divided by the dataset size.
    """
    model.train()
    loss_all = 0
    for data in data_loader:
        optimizer.zero_grad()
        output = model(data)
        # BCELoss requires a floating-point target with the same shape as the
        # prediction; integer labels would raise, so cast and reshape here.
        label = data.y.to(output.dtype).reshape(output.shape)
        loss = crit(output, label)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the number of graphs in the batch.
        loss_all += data.num_graphs * loss.item()
    return loss_all / len(data_loader.dataset)
# Model, optimiser and binary cross-entropy criterion used by train().
model = Net()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
crit = torch.nn.BCELoss()

# Ten epochs; print the epoch index followed by the value returned by train().
for epoch in range(0, 10):
    print('epoch:', epoch)
    loss = train()
    print(loss)
方式二:调用geometric上现有的数据集(手动封装)
由于实验室的电脑没有联网,无法下载网页上的数据集,所以这里我用Google Colab提供的服务器来实现。(这个服务器的好处是常用的包基本都已经配置好了,所以这里只需要安装torch_geometric就可以用啦)
1、连接到云端硬盘
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
2、配置环境
!pip install torch-geometric
3、导包
import argparse
import os
import os.path as osp
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.datasets import UPFD #数据集名称
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, GCNConv, SAGEConv, global_max_pool #训练要用到的层
from torch_geometric.transforms import ToUndirected
4、加载数据集
class Args:
    """Notebook stand-in for command-line arguments (plain class attributes)."""

    # Which dataset to load; the alternative noted by the author is 'gossipcop'.
    dataset = 'politifact'
    # Node feature type; options listed by the author:
    # 'profile', 'spacy', 'bert', 'content'.
    feature = 'spacy'
    # Convolution type used by Net: 'GCN', 'GAT' or 'SAGE'.
    model = 'GCN'
args = Args()

# Dataset root, resolved relative to the current working directory:
# <cwd>/../data/UPFD
path = osp.join(os.getcwd(), '..', 'data', 'UPFD')

# One UPFD split each for training, validation and testing; every split gets
# its own ToUndirected transform instance.
train_dataset, val_dataset, test_dataset = [
    UPFD(path, args.dataset, args.feature, split, ToUndirected())
    for split in ('train', 'val', 'test')
]

# Only the training loader shuffles; evaluation loaders keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)
5、定义GCN网络
class Net(torch.nn.Module):
    """Single-convolution graph classifier with an optional root-node branch.

    Args:
        model: Convolution type, one of ``'GCN'``, ``'SAGE'`` or ``'GAT'``.
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of the convolution output and hidden layers.
        out_channels: Number of output classes.
        concat: If True, the transformed features of the first node of each
            graph are concatenated with the pooled graph embedding before
            classification.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """

    def __init__(self, model, in_channels, hidden_channels, out_channels, concat=False):
        super().__init__()
        self.concat = concat

        if model == 'GCN':
            self.conv1 = GCNConv(in_channels, hidden_channels)
        elif model == 'SAGE':
            self.conv1 = SAGEConv(in_channels, hidden_channels)
        elif model == 'GAT':
            self.conv1 = GATConv(in_channels, hidden_channels)
        else:
            # Fail at construction time instead of leaving conv1 undefined and
            # hitting an AttributeError in forward().
            raise ValueError(
                f"Unsupported model {model!r}; expected 'GCN', 'SAGE' or 'GAT'."
            )

        if self.concat:
            self.lin0 = Linear(in_channels, hidden_channels)
            self.lin1 = Linear(2 * hidden_channels, hidden_channels)

        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities over the output classes.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity.
            batch: Graph id of every node; assumed sorted so that the nodes of
                one graph are contiguous.
        """
        h = self.conv1(x, edge_index).relu()
        h = global_max_pool(h, batch)

        if self.concat:
            # Index of the first node of every graph: position 0 plus every
            # position right after the graph id changes. The original author
            # treats this node as the root (tweet/news) node.
            root = (batch[1:] - batch[:-1]).nonzero(as_tuple=False).view(-1)
            root = torch.cat([root.new_zeros(1), root + 1], dim=0)
            news = x[root]

            news = self.lin0(news).relu()
            h = self.lin1(torch.cat([news, h], dim=-1)).relu()

        h = self.lin2(h)
        return h.log_softmax(dim=-1)
6、确定模型、优化器等
# Use the GPU when CUDA reports one, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Classifier with a hidden width of 128 and the root-node branch enabled.
model = Net(
    args.model,
    train_dataset.num_features,
    128,
    train_dataset.num_classes,
    concat=True,
)
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=0.01)
7、定义训练、测试函数
def train():
    """Train for one epoch on ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``train_loader``.

    Returns:
        Sum of (batch loss * graphs in batch) divided by the dataset size.
    """
    model.train()

    running = 0
    for graphs in train_loader:
        graphs = graphs.to(device)
        optimizer.zero_grad()
        log_probs = model(graphs.x, graphs.edge_index, graphs.batch)
        batch_loss = F.nll_loss(log_probs, graphs.y)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch by its number of graphs before averaging.
        running += float(batch_loss) * graphs.num_graphs

    return running / len(train_loader.dataset)
@torch.no_grad()
def test(loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``y`` and ``num_graphs``.

    Returns:
        Fraction of graphs whose arg-max prediction equals the label.
    """
    model.eval()

    n_correct = 0
    n_seen = 0
    for graphs in loader:
        graphs = graphs.to(device)
        scores = model(graphs.x, graphs.edge_index, graphs.batch)
        pred = scores.argmax(dim=-1)
        n_correct += int((pred == graphs.y).sum())
        n_seen += graphs.num_graphs

    return n_correct / n_seen
8、训练
# 60 epochs: train once, then report accuracy on all three splits.
for epoch in range(1, 61):
    loss = train()
    train_acc, val_acc, test_acc = (
        test(split_loader)
        for split_loader in (train_loader, val_loader, test_loader)
    )
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
          f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')
如果想看一下刚才的数据集下载到了哪个路径下,可以打印数据集的raw_dir:
# Print the training dataset's raw_dir attribute (per the surrounding text,
# the location the dataset was downloaded to).
print(train_dataset.raw_dir)
但是这里还有一个问题:我的数据是在content文件夹下的,这个文件夹点进去貌似只有上传的功能,没有下载的功能?
TopKPooling结构
其实就是对图进行剪枝操作:把得分低的节点剔除掉,然后用剩下的节点重新组合成新的图