Building a GCN-LSTM Spatiotemporal Crime Prediction Model with PyG
1. Preface
I have recently been working on spatiotemporal crime prediction and crime-distribution visualization, for which graph neural networks are an essential tool. To document my process of learning PyG, I wrote this article by combining the official examples (which are quite hard to follow) with write-ups from other people online, so that I have a record to come back to in case I forget. If you spot any mistakes, please be kind; and if you find this useful, I'd appreciate a follow.
2. Installing PyG and Required Packages
The installation process is omitted here.
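For reference, here are the imports the code below relies on; I list only the imports, since the exact install commands depend on your PyTorch/CUDA versions.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.recurrent import TGCN
import torch_geometric_temporal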
3. Data Description
This article uses crime data from New York, USA, as shown in the figure.
The data is split into training, validation, and test sets. It is a four-dimensional array of shape [16, 16, 608, 4]: a 16×16 grid of regions, 608 time steps per region, and 4 crime types. The input is the crime matrix for the past n days; the output is the crime matrix for day n+1.
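To make the grid-to-graph mapping concrete, here is a minimal sketch that flattens the 16×16 grid into 256 node series. The file name crime_data.npy and the choice to sum over the 4 crime types are my assumptions, since the loader in the next section expects a two-dimensional (nodes, time) array.

import numpy as np

crime_data = np.load("crime_data.npy")               # hypothetical file, shape [16, 16, 608, 4]
num_nodes = 16 * 16                                  # one graph node per grid cell
node_series = crime_data.reshape(num_nodes, 608, 4)  # [256, 608, 4]
data_norm = node_series.sum(axis=-1)                 # assumption: sum the 4 crime types -> [256, 608]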
4. Data Processing
A key step in applying graph neural networks is constructing the graph. How do we build one? PyG is fairly user-friendly here: after converting the data, you can use a KNN algorithm to select neighboring nodes and build edges from them. Note that the input data must follow the dimension format PyG expects. (Update: we now construct the graph from the NYC Sectors of New York City instead.) This section also covers the sliding time window used to build input/output pairs.
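As an illustration of the KNN construction mentioned above, PyG's knn_graph (which requires the torch-cluster extension) builds an edge list from node coordinates; the grid-centroid positions used here are my assumption:

import torch
from torch_geometric.nn import knn_graph

# Hypothetical node positions: the (row, col) centre of each 16x16 grid cell
pos = torch.tensor([[i, j] for i in range(16) for j in range(16)], dtype=torch.float)
edge_index = knn_graph(pos, k=8)   # connect each node to its 8 nearest neighbours, shape [2, E]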
class MadridDatasetLoader(object):
    def __init__(self, data_norm, adj, device="cuda"):
        super(MadridDatasetLoader, self).__init__()
        self.data = data_norm   # normalized crime counts, shape (nodes, time)
        self.adj = adj          # edge list, e.g. a two-column DataFrame of node pairs

    def process_data(self, time_step, pre_step):
        # Sliding time window: each sample is `time_step` past frames,
        # each label the following `pre_step` frames
        nodes, timeLength = self.data.shape[0], self.data.shape[1]
        train_seq, train_label = [], []
        for i in range(0, timeLength - time_step - pre_step + 1, pre_step):
            train_seq.append(self.data[:, i: i + time_step])
            train_label.append(self.data[:, i + time_step: i + time_step + pre_step])
        self.features = train_seq
        self.label = train_label

    def generate_data(self, time_step=3, pre_step=1):
        # Wrap the windows into a PyG-Temporal signal over a static graph
        edge_weight = np.ones(self.adj.shape[0])   # one weight per edge
        self.process_data(time_step, pre_step)
        dataset = torch_geometric_temporal.StaticGraphTemporalSignal(
            edge_index=self.adj.values.T, edge_weight=edge_weight,
            features=self.features, targets=self.label)
        return dataset
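A minimal usage sketch of the loader; data_norm is the (nodes, time) array from Section 3 and adj_df is a hypothetical two-column DataFrame listing edge endpoints.

loader = MadridDatasetLoader(data_norm, adj_df)
dataset = loader.generate_data(time_step=3, pre_step=1)
for snapshot in dataset:
    print(snapshot.x.shape, snapshot.y.shape)   # (nodes, 3) and (nodes, 1)
    break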
5. Model Construction
Here we build a GCN+LSTM model and a TGCN model. Put simply, the core idea is that PyG takes batchsize-many graphs, merges them into one large graph, lets the network process that single graph, and then splits the result back into batchsize-many per-graph outputs, as shown in the sketch below.
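To make the "merge batchsize graphs into one big graph" idea concrete, here is a small PyG sketch on toy graphs (not the crime data):

import torch
from torch_geometric.data import Data, Batch

g1 = Data(x=torch.randn(3, 4), edge_index=torch.tensor([[0, 1], [1, 2]]))
g2 = Data(x=torch.randn(2, 4), edge_index=torch.tensor([[0], [1]]))
big = Batch.from_data_list([g1, g2])   # one disconnected graph with 3 + 2 = 5 nodes
print(big.batch)                       # tensor([0, 0, 0, 1, 1]): maps each node back to its graph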
class GCN(torch.nn.Module):
    def __init__(self, in_feats, h_feats, out_feats, batchsize):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_feats, h_feats)
        self.conv2 = GCNConv(h_feats, out_feats)
        self.fc = nn.Sequential(
            nn.Linear(128, out_feats),   # 128 = LSTM hidden size
            nn.ReLU(),
            nn.Linear(out_feats, 1),
        )
        self.batch = batchsize
        # Note: dropout on a single-layer LSTM has no effect (PyTorch warns about this)
        self.lstm = nn.LSTM(out_feats, 128, batch_first=True, dropout=0.5)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = x.float()
        x = F.elu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        # Treat each node embedding as a length-1 sequence for the LSTM
        x = x.view(len(x), 1, -1)
        x, _ = self.lstm(x)
        out = self.fc(x)
        # Split the big batched graph back into `batchsize` per-graph outputs
        out = torch.reshape(out, (self.batch, -1))
        return out
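A usage sketch with PyG's DataLoader, which performs exactly the merge-into-one-big-graph step described above; graphs, the feature sizes, and the batch size are hypothetical. drop_last=True keeps the final reshape consistent with batchsize.

from torch_geometric.loader import DataLoader

loader = DataLoader(graphs, batch_size=16, drop_last=True)   # `graphs`: a list of Data objects
model = GCN(in_feats=3, h_feats=64, out_feats=32, batchsize=16).cuda()
for batch in loader:
    out = model(batch.to('cuda'))                            # shape [16, nodes_per_graph]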
class TGCN_model(torch.nn.Module):
    def __init__(self, node_features, unit_num, out_step):
        super(TGCN_model, self).__init__()
        # Temporal Graph Convolutional cell from torch_geometric_temporal
        self.tgnn_n = TGCN(in_channels=node_features,
                           out_channels=unit_num)
        self.linear_n = torch.nn.Linear(unit_num, out_step)

    def forward(self, x, edge_index, edge_weight):
        """
        x           : node features for the T input time steps
        edge_index  : graph edge indices
        edge_weight : edge weights
        """
        mu_res = self.linear_n(F.softplus(self.tgnn_n(x, edge_index, edge_weight)))
        return mu_res
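The TGCN variant can be driven directly by the StaticGraphTemporalSignal built in Section 4; the hidden size here is a hypothetical choice.

model = TGCN_model(node_features=3, unit_num=32, out_step=1).cuda()
for snapshot in dataset:
    snapshot = snapshot.to('cuda')
    mu_res = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)   # (nodes, 1)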
6. Training and Testing
We update the training and testing functions here; a validation loop is not included. The global flag `four` chooses between the three-parameter zero-inflated negative binomial loss and the four-parameter Tweedie loss; the loss functions (`nb_zeroinflated_nll_loss`, `zb_tweedie_nll_loss`) and the sampling helper `nb_zeroinflated_draw` are user-defined and not shown here.
def train(model, dataset, test_dataset):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(100):
        model.train()
        loss_batch = []
        for snapshot in dataset:
            snapshot = snapshot.to('cuda')
            optimizer.zero_grad()
            # Get model predictions; `four` selects the three- or four-parameter head
            if not four:
                phi_res, rou_res, mu_res = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
                loss = nb_zeroinflated_nll_loss(snapshot.y, phi_res, rou_res, mu_res)
            else:
                phi_res, rou_res, mu_res, pi_res = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
                loss = zb_tweedie_nll_loss(snapshot.y, phi_res, rou_res, mu_res, pi_res)
            loss_batch.append(loss.item())
            # Backpropagate the negative log-likelihood and update
            loss.backward()
            optimizer.step()
        torch.cuda.empty_cache()
        result = test(model, test_dataset)
        print("Epoch {} train zb_Tweedie: {:.4f} Mae: {:.4f}".format(epoch, np.mean(loss_batch), result))
def test(model, dataset):
    with torch.no_grad():
        model.eval()
        prediction, true = [], []
        for snapshot in dataset:
            snapshot = snapshot.to('cuda')
            # Get model predictions
            if not four:
                phi_res, rou_res, mu_res = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
            else:
                phi_res, rou_res, mu_res, pi_res = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
            # Draw point predictions from the fitted zero-inflated NB distribution
            point = nb_zeroinflated_draw(phi_res.cpu().numpy(), rou_res.cpu().numpy(), mu_res.cpu().numpy())
            prediction.append(point)
            true.append(snapshot.y.cpu().numpy())
        # Flatten predictions and ground truth into one-dimensional arrays
        prediction_flat = np.concatenate([p.flatten() for p in prediction])
        true_flat = np.concatenate([t.flatten() for t in true])
        # Compute MAE
        mae = metrics.mean_absolute_error(true_flat, prediction_flat)
        return mae
7. Result Analysis
Since the experiments are limited, only a traditional CNN, a plain GCN, and the GCN-LSTM were compared. The MAPE differs substantially between them: the GCN brings a large improvement over the CNN, and adding the LSTM to model the temporal dimension improves the score further by a few points. Interested readers are encouraged to try it themselves.
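Since the comparison above is reported in MAPE, here is a minimal sketch of how it can be computed on the flattened predictions; clipping the denominator with eps is my assumption, to cope with the many zero counts in crime data.

import numpy as np

def mape(y_true, y_pred, eps=1.0):
    # Mean absolute percentage error, with the denominator clipped at eps
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps)) * 100.0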