数据处理:数据集采用 Cora 数据集
import dgl
from dgl.data import DGLDataset
import torch
import os
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch.nn.functional as F
class CoraDataset(DGLDataset):
    """Single-graph dataset built from the raw Cora files.

    ``process`` reads ``cora.content`` and ``cora.cites`` and stores one
    ``dgl`` graph in ``self.graph`` with the node data fields ``feat``,
    ``label``, ``train_mask``, ``val_mask`` and ``test_mask``.
    """

    def __init__(self):
        super().__init__(name='cora')

    def process(self):
        """Load the raw files and build ``self.graph``.

        Raises:
            KeyError: if ``cora.cites`` mentions a node id that does not
                appear in the first column of ``cora.content``.
        """
        nodes_data = pd.read_csv(r'../data/cora/cora.content', sep='\t', header=None)
        edges_data = pd.read_csv(r'../data/cora/cora.cites', sep='\t', header=None)

        # First column is used as the node id, last column as the class
        # label, and every column in between as the feature vector.
        node_ids = nodes_data.iloc[:, 0].tolist()
        node_l = nodes_data.iloc[:, -1]

        # Node ids are arbitrary numbers; map them to row positions
        # 0..n-1 because graph node indices must be contiguous.
        id_to_index = {node_id: row for row, node_id in enumerate(node_ids)}

        # Cast to float32 so the features have the same dtype as the
        # (float32 by default) weights of the layers that consume them.
        node_features = torch.from_numpy(
            nodes_data.iloc[:, 1:-1].to_numpy(dtype=np.float32))
        node_labels = torch.from_numpy(
            node_l.astype('category').cat.codes.to_numpy().astype(np.int64))

        try:
            src = [id_to_index[node_id] for node_id in edges_data.iloc[:, 0].tolist()]
            dst = [id_to_index[node_id] for node_id in edges_data.iloc[:, 1].tolist()]
        except KeyError as err:
            # Fail with the offending id instead of letting a ``None``
            # entry surface later as an unrelated dtype error.
            raise KeyError(
                'cora.cites references node id {!r} that is missing from '
                'cora.content'.format(err.args[0])) from err
        edges_src = torch.tensor(src, dtype=torch.int64)
        edges_dst = torch.tensor(dst, dtype=torch.int64)

        n_nodes = nodes_data.shape[0]
        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=n_nodes)
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels

        # Split 60% / 20% / remainder by row order of the content file
        # (no shuffling): first rows train, next rows validate, rest test.
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        positions = torch.arange(n_nodes)
        self.graph.ndata['train_mask'] = positions < n_train
        self.graph.ndata['val_mask'] = (positions >= n_train) & (positions < n_train + n_val)
        self.graph.ndata['test_mask'] = positions >= n_train + n_val

    def __getitem__(self, i):
        """Return the single graph; the index ``i`` is not inspected."""
        return self.graph

    def __len__(self):
        """Return 1: the dataset holds exactly one graph."""
        return 1
将数据导入 Neo4j,查看是否存在孤立节点
// Find nodes with in-degree 0 (no incoming relationship).
MATCH (n) WHERE NOT ()-->(n) RETURN n
// Find nodes with out-degree 0 (no outgoing relationship).
// A node returned by both queries has no relationships at all, i.e. it is isolated.
MATCH (n) WHERE NOT (n)-->() RETURN n
模型搭建及训练
from dgl.nn import GraphConv
import torch.nn as nn
import torch.nn.functional as F
import torch
import dgl
from DLG_GCN.dataprocess import CoraDataset
import matplotlib.pyplot as plt
import numpy as np
class GCN(nn.Module):
    """Two stacked ``GraphConv`` layers with a ReLU in between.

    Args:
        in_feats: size of each input node feature vector.
        h_feats: size of the hidden representation.
        num_classes: size of the output produced for each node.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        # Input features -> hidden representation.
        self.conv1 = GraphConv(in_feats, h_feats)
        # Hidden representation -> one score per class.
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Return the second layer's output for graph ``g``.

        No softmax is applied here; the raw layer output is returned.
        """
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)
def _masked_accuracy(pred, labels, mask):
    """Return the fraction of correct predictions among masked nodes as a float."""
    return (pred[mask] == labels[mask]).float().mean().item()


def _plot_training_curves(loss_history, val_acc_history, test_acc_history, output_dir):
    """Save the combined and the per-metric training curves, then show them."""
    import os  # imported locally so this helper is self-contained

    if output_dir:
        # ``savefig`` does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

    epochs = np.arange(len(loss_history))

    # Combined figure: loss on the left axis, accuracies on the right axis,
    # because the two quantities live on different scales.
    fig, ax_loss = plt.subplots()
    ax_acc = ax_loss.twinx()
    lines = (
        ax_loss.plot(epochs, loss_history, 'r', label='train_loss')
        + ax_acc.plot(epochs, test_acc_history, 'g', label='test_acc')
        + ax_acc.plot(epochs, val_acc_history, 'b', label='val_acc')
    )
    ax_loss.set_xlabel('iteration')
    ax_loss.set_ylabel('training loss')
    ax_acc.set_ylabel('accuracy')
    # One legend covering the lines of both axes.
    ax_acc.legend(lines, [line.get_label() for line in lines], loc='upper left')
    ax_loss.grid(True)
    fig.savefig(os.path.join(output_dir, "train_val_test.png"))

    # One figure per metric; colours match the combined figure.
    single_curves = (
        (2, loss_history, 'r', 'loss', "train.png"),
        (3, val_acc_history, 'b', 'val_accuracy', "val.png"),
        (4, test_acc_history, 'g', 'test_accuracy', "test.png"),
    )
    for fig_num, values, color, label, filename in single_curves:
        plt.figure(fig_num)
        plt.plot(epochs, values, color, label=label)
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, filename))

    plt.show()


def train(g, model, num_epochs=1000, lr=0.0005, output_dir="./mydata"):
    """Train ``model`` on graph ``g`` with full-batch Adam and plot the curves.

    Each epoch runs one forward pass over the whole graph, computes the
    cross-entropy loss on the nodes selected by ``train_mask``, records
    validation/test accuracy, and takes one optimizer step. Progress is
    printed every 5 epochs. After training, the loss and accuracy curves
    are written as PNG files to ``output_dir`` and displayed.

    Args:
        g: graph whose node data provides ``feat``, ``label``,
            ``train_mask``, ``val_mask`` and ``test_mask``.
        model: module called as ``model(g, features)`` that returns one
            row of class scores per node.
        num_epochs: number of training epochs.
        lr: learning rate passed to Adam.
        output_dir: directory for the PNG files; created if missing.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = 0.0
    best_test_acc = 0.0
    loss_history = []
    val_acc_history = []
    test_acc_history = []

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    for epoch in range(num_epochs):
        logits = model(g, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Accuracy on the validation/test nodes for this epoch.
        pred = logits.argmax(1)
        val_acc = _masked_accuracy(pred, labels, val_mask)
        test_acc = _masked_accuracy(pred, labels, test_mask)

        # Store plain floats: the loss tensor is attached to the autograd
        # graph and cannot be converted to NumPy for plotting as-is.
        loss_value = loss.item()
        loss_history.append(loss_value)
        val_acc_history.append(val_acc)
        test_acc_history.append(test_acc)

        # Report the test accuracy at the epoch with the best validation
        # accuracy, rather than the best test accuracy seen.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 5 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                epoch, loss_value, val_acc, best_val_acc, test_acc, best_test_acc))

    _plot_training_curves(loss_history, val_acc_history, test_acc_history, output_dir)
# Build the dataset and take its single graph.
cora_data = CoraDataset()
graph = cora_data[0]
# Add the reverse of every edge so information flows both ways along each
# link; copy_ndata=True carries the node data over to the new graph.
g = dgl.to_bidirected(graph, copy_ndata=True)
# Derive the number of classes from the labels (integer codes starting at 0)
# instead of hard-coding it.
num_classes = int(g.ndata['label'].max().item()) + 1
model = GCN(g.ndata['feat'].shape[1], 16, num_classes)
train(g, model)
训练结果