学习目标:
提示:学习图神经网络
推荐:百度的图神经网络7日打卡训练营
学习内容:
百度的图神经网络框架
1、 搭建飞桨(PaddlePaddle)开发环境
conda activate paddle_gpu
python -m pip install paddlepaddle-gpu==1.8.3.post107 -i https://mirror.baidu.com/pypi/simple
2、 搭建 pgl 开发环境
!pip install pgl
3、 掌握图神经网络算法:这里以百度研发的 ERNIESage 算法为例进行介绍
4、 pgl实现图神经网络算法
ErnieSage算法:
提示:这里可以添加计划学习的时间
1、ERNIE 背景知识
2、ERNIE 核心思想
3、ERNIE 用在图结构上,即 ERNIESage 算法
3.1 ERNIESage Node(ERNIE 作用在节点上)
3.2 ERNIESage Edge(ERNIE 作用在边上)
PGL 代码实现ErnieSage:
1、数据
1.1输入example数据集
! head -n 3 example_data/link_predict/graph_data.txt
! wc -l example_data/link_predict/graph_data.txt
1.2 表达一个文本图
from preprocessing.dump_graph import dump_graph
from preprocessing.dump_graph import dump_node_feat
from preprocessing.dump_graph import download_ernie_model
from preprocessing.dump_graph import load_config
from pgl.graph_wrapper import BatchGraphWrapper
import propeller.paddle as propeller
import paddle.fluid as F
import paddle.fluid.layers as L
import numpy as np
from preprocessing.dump_graph import load_config
from models.pretrain_model_loader import PretrainedModelLoader
from pgl.graph import MemmapGraph
from models.encoder import linear
from ernie import ErnieModel
# Seed NumPy's global RNG so the random demo inputs built later are repeatable.
np.random.seed(123)
config = load_config("./config/erniesage_link_predict.yaml")

# Turn the raw text data into a text graph and its node features; the
# results are reloaded from ./workdir/ just below.
dump_graph(config)
dump_node_feat(config)

# MemmapGraph reloads a graph that was previously written out by graph.dump.
graph = MemmapGraph("./workdir/")

# Basic graph statistics.
print("节点", graph.num_nodes, "个")
print("边", graph.edges, graph.edges.shape)

# Node features: report the shape of every feature array, then dump the
# arrays themselves.
node_features = graph.node_feat
shape_report = []
for feat_name in node_features:
    shape_report.append(
        "%s shape is %s" % (feat_name, str(node_features[feat_name].shape)))
print(shape_report)
# Per the original author's note: each node is one text segment converted to
# character-level ids with a fixed length of 40, and the example graph has
# 1021 nodes.
print(node_features)
2、模型
2.1 ERNIESage V1 核心代码
# ERNIESage V1,ERNIE作用在节点上
class ERNIESageV1Encoder():
    """ERNIESage V1 encoder: ERNIE runs on nodes, then neighbors are aggregated.

    Each node's token-id sequence is encoded by ERNIE, and the resulting
    per-node vectors are combined with their neighbors' vectors by
    ``graphsage_sum``.
    """

    def __init__(self, config):
        """Keep a reference to the model configuration.

        Args:
            config: configuration object; this class reads ``cls_id``,
                ``ernie_config`` and ``hidden_size`` from it.
        """
        self.config = config

    def __call__(self, graph_wrappers, inputs):
        """Build the encoder ops and return one feature tensor per index set.

        Args:
            graph_wrappers: sequence of graph wrappers; only the first one is
                used, and it must expose a ``"term_ids"`` node feature.
            inputs: iterable of node-index tensors to read features for.

        Returns:
            A list with one L2-normalized feature tensor per entry of
            ``inputs``, in the same order.
        """
        # Step 1: ERNIE extracts node semantics from each node's id sequence.
        term_ids = graph_wrappers[0].node_feat["term_ids"]
        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1], "int64",
                                              self.config.cls_id)  # cls [B, 1]
        # Prepend the [CLS] id column: [B, 1] + [B, S] -> [B, S + 1].
        # Layout per the original author: [CLS], id1, id2, id3 .. [SEP]
        # (the trailing [SEP] is assumed to be in the input data already).
        term_ids = L.concat([cls, term_ids], 1)

        ernie_model = ErnieModel(self.config.ernie_config)
        # First output is used as the [CLS]-position representation
        # (presumably [B, F] -- TODO confirm against ErnieModel).
        cls_feat, _ = ernie_model(term_ids)

        # Step 2: GNN aggregation over neighbors.
        feature = graphsage_sum(cls_feat, graph_wrappers[0],
                                self.config.hidden_size,
                                "v1_graphsage_sum", "leaky_relu")

        final_feats = [
            self.take_final_feature(feature, i, "v1_final_fc") for i in inputs
        ]
        return final_feats

    def take_final_feature(self, feature, index, name):
        """Gather rows of ``feature`` at ``index``, project and L2-normalize.

        Args:
            feature: node feature tensor to read rows from.
            index: tensor of row indices to gather.
            name: name passed to the ``linear`` projection layer.

        Returns:
            The gathered rows after a ``linear`` layer of width
            ``config.hidden_size`` and L2 normalization along axis 1.
        """
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat
def graphsage_sum(feature, gw, hidden_size, name, act):
    """GraphSAGE-style layer that sums neighbor features.

    Args:
        feature: per-node feature tensor.
        gw: graph wrapper providing ``send`` / ``recv`` message passing.
        hidden_size: output width passed to both ``linear`` projections.
        name: name prefix; ``"_l"`` is appended for the self projection and
            ``"_r"`` for the neighbor projection.
        act: activation name passed to both ``linear`` projections.

    Returns:
        The concatenation of the projected self feature and the projected
        summed-neighbor feature, L2-normalized along axis 1
        (presumably [B, 2 * hidden_size] -- TODO confirm against ``linear``).
    """
    # copy_send: the message on each edge is the source node's feature "h".
    msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
    # sum_recv: each node sum-pools the messages it receives.
    neigh_feature = gw.recv(
        msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

    self_feature = linear(feature, hidden_size, name + "_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name + "_r", act)

    output = L.concat([self_feature, neigh_feature], axis=1)
    output = L.l2_normalize(output, axis=1)
    return output
# Build a tiny random input: 4 nodes, 6 directed edges (node 0 linked in both
# directions with nodes 1, 2 and 3), and random token ids in [4, 10000).
feat_size = 40
feed_dict = {
    "num_nodes": np.array([4]),
    "num_edges": np.array([6]),
    "edges": np.array([[0, 1], [1, 0], [0, 2], [2, 0], [0, 3], [3, 0]]),
    "term_ids": np.random.randint(4, 10000, size=(4, feat_size)),
    "inputs": np.array([0])}

# Use GPU 0 when this Paddle build has CUDA support, otherwise fall back to
# the CPU so the demo still runs. `place` is reused by the V2 demo below.
place = F.CUDAPlace(0) if F.is_compiled_with_cuda() else F.CPUPlace()
exe = F.Executor(place)

# Model V1.
erniesage_v1_encoder = ERNIESageV1Encoder(config)
main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        # Placeholders; the names must match the keys of feed_dict.
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        # Width comes from feat_size so it cannot drift from the fed array.
        node_feat = L.data("term_ids", [-1, feat_size], False, 'int64')
        inputs = L.data("inputs", [-1], False, 'int64')

        # Build a graph from its basic information (nodes, edges, features).
        gw = BatchGraphWrapper(num_nodes, num_edges, edges,
                               {"term_ids": node_feat})
        outputs = erniesage_v1_encoder([gw], [inputs])

# Run the startup program first, then the main program on the random feed.
exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)
2.2 ERNIESage V2 核心代码
# ERNIESage V2,ERNIE作用在边上
class ERNIESageV2Encoder():
    """ERNIESage V2 encoder: ERNIE runs on edges (source/destination pairs).

    For every edge, the source and destination token-id sequences are
    concatenated and encoded by ERNIE; each node sums the resulting messages
    from its neighbors and combines them with its own ERNIE encoding.
    """

    def __init__(self, config):
        """Keep a reference to the model configuration.

        Args:
            config: configuration object; this class reads ``cls_id``,
                ``ernie_config`` and ``hidden_size`` from it.
        """
        self.config = config

    def __call__(self, graph_wrappers, inputs):
        """Build the encoder ops and return one feature tensor per index set.

        Args:
            graph_wrappers: sequence of graph wrappers; only the first one is
                used, and it must expose a ``"term_ids"`` node feature.
            inputs: iterable of node-index tensors to read features for.

        Returns:
            A list with one L2-normalized feature tensor per entry of
            ``inputs``, in the same order.
        """
        gw = graph_wrappers[0]
        term_ids = gw.node_feat["term_ids"]  # term_ids [B, S]

        # Step 1: the GNN send stage ships token ids along each edge.
        def ernie_send(src_feat, dst_feat, edge_feat):
            """Encode one (source, destination) text pair per edge with ERNIE."""

            def build_position_ids(token_ids):
                # Running count of non-zero ids minus one: 0, 1, 2, 3 ..
                input_mask = L.cast(token_ids > 0, "int64")
                position_ids = L.cumsum(input_mask, axis=1) - 1
                return position_ids

            # Token-id sequences of the sending (src) and receiving (dst) node.
            src_ids, dst_ids = src_feat["term_ids"], dst_feat["term_ids"]

            # Build the [CLS] id column and prepend it to the source half.
            cls = L.fill_constant_batch_size_like(
                src_feat["term_ids"], [-1, 1], "int64",
                self.config.cls_id)  # cls [B, 1]
            src_ids = L.concat([cls, src_ids], 1)  # src_ids [B, S+1]

            # Full token ids for the pair: src followed by dst, [B, 2S+1].
            # Layout per the original author:
            # [CLS], src_id1, src_id2.. [SEP], dst_id1, dst_id2..[SEP]
            # Named pair_ids so it does not shadow the outer term_ids.
            pair_ids = L.concat([src_ids, dst_ids], 1)

            # Segment ids: 0 for the src half, 1 for the dst half.
            sent_ids = L.concat(
                [L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
            position_ids = build_position_ids(pair_ids)

            # Step 2: ERNIE extracts the edge semantics; the first output is
            # used as the sentence-level representation.
            ernie_model = ErnieModel(self.config.ernie_config)
            cls_feat, _ = ernie_model(pair_ids, sent_ids, position_ids)
            return cls_feat

        msg = gw.send(ernie_send, nfeat_list=[("term_ids", term_ids)])

        # Step 3: the GNN recv stage sums the neighbors' CLS representations.
        neigh_feature = gw.recv(
            msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

        # Prepend a [CLS] id to every center node's own sequence as well.
        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1],
                                              "int64", self.config.cls_id)
        term_ids = L.concat([cls, term_ids], 1)

        # Step 4: run ERNIE once on the center node and concat with neighbors.
        ernie_model = ErnieModel(self.config.ernie_config)
        self_cls_feat, _ = ernie_model(term_ids)

        hidden_size = self.config.hidden_size
        self_feature = linear(self_cls_feat, hidden_size,
                              "erniesage_v2_l", "leaky_relu")
        neigh_feature = linear(neigh_feature, hidden_size,
                               "erniesage_v2_r", "leaky_relu")
        output = L.concat([self_feature, neigh_feature], axis=1)
        output = L.l2_normalize(output, axis=1)

        final_feats = [
            self.take_final_feature(output, i, "v2_final_fc") for i in inputs
        ]
        return final_feats

    def take_final_feature(self, feature, index, name):
        """Gather rows of ``feature`` at ``index``, project and L2-normalize.

        Args:
            feature: node feature tensor to read rows from.
            index: tensor of row indices to gather.
            name: name passed to the ``linear`` projection layer.

        Returns:
            The gathered rows after a ``linear`` layer of width
            ``config.hidden_size`` and L2 normalization along axis 1.
        """
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat
# Run the V2 encoder once. This reuses `config`, `place` and `feed_dict`
# defined by the earlier cells.
erniesage_v2_encoder = ERNIESageV2Encoder(config)
main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        # Placeholders; the names must match the keys of feed_dict.
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        # Leading dimensions are left dynamic (-1) so the declared shapes
        # agree with the fed arrays: term_ids is (4, 40) and inputs is (1,).
        node_feat = L.data("term_ids", [-1, 40], False, 'int64')
        inputs = L.data("inputs", [-1], False, 'int64')

        # Build a graph from its basic information (nodes, edges, features).
        gw = BatchGraphWrapper(num_nodes, num_edges, edges,
                               {"term_ids": node_feat})
        outputs = erniesage_v2_encoder([gw], [inputs])

# Run the startup program first, then the main program on the random feed.
exe = F.Executor(place)
exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)
3、 训练
3.1 以一个link predict的任务为例,读取一个语义图,以上面的边为目标进行无监督的训练
class ERNIESageLinkPredictModel(propeller.train.Model):
    """Link-prediction model that scores node pairs with ERNIESage features.

    Implements the ``forward`` / ``loss`` / ``backward`` / ``metrics`` hooks
    of ``propeller.train.Model``.
    """

    def __init__(self, hparam, mode, run_config):
        """Store the hyper-parameters, run mode and run configuration.

        Args:
            hparam: hyper-parameters; this class reads ``margin`` and
                ``'learning_rate'`` from it and passes it to the encoder.
            mode: run mode, compared against ``propeller.RunMode.PREDICT``.
            run_config: run configuration, stored but not used here.
        """
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        """Encode the center, positive and negative nodes of a batch.

        Args:
            features: sequence of exactly ten tensors, unpacked in the order
                shown below.

        Returns:
            ``(user_feat, pos_item_feat, neg_item_feat)`` in every mode except
            PREDICT; ``(user_feat, user_real_index)`` in PREDICT mode.
        """
        (num_nodes, num_edges, edges, node_feat_index, node_feat_term_ids,
         user_index, pos_item_index, neg_item_index, user_real_index,
         pos_item_real_index) = features

        node_feat = {"index": node_feat_index, "term_ids": node_feat_term_ids}
        graph_wrapper = BatchGraphWrapper(num_nodes, num_edges, edges,
                                          node_feat)

        # The V2 encoder is the one in use. The author left two alternatives
        # disabled: ERNIESageV1Encoder and ERNIESageV3Encoder (the latter is
        # not defined in the visible source).
        encoder = ERNIESageV2Encoder(self.hparam)

        # Extract features for the center nodes, their neighbors and the
        # randomly sampled nodes respectively.
        outputs = encoder([graph_wrapper],
                          [user_index, pos_item_index, neg_item_index])
        user_feat, pos_item_feat, neg_item_feat = outputs

        if self.mode is not propeller.RunMode.PREDICT:
            return user_feat, pos_item_feat, neg_item_feat
        else:
            return user_feat, user_real_index

    def loss(self, predictions, labels):
        """Margin-based ranking loss over positive and negative scores.

        Args:
            predictions: the three-tuple returned by ``forward`` outside
                PREDICT mode.
            labels: unused.

        Returns:
            Mean of ``relu(neg - pos + margin)``.
        """
        user_feat, pos_item_feat, neg_item_feat = predictions
        # Dot product of each center node with its own positive item.
        pos = L.reduce_sum(user_feat * pos_item_feat, -1, keep_dim=True)
        # Every center node is scored against every negative item in the
        # batch. The author left an element-wise alternative disabled
        # (reduce_sum of user_feat * neg_item_feat), annotated "60." versus
        # "80." for this variant -- presumably evaluation scores.
        neg = L.matmul(user_feat, neg_item_feat, transpose_y=True)
        # The loss is zero once the positive score exceeds the negative score
        # by at least the margin.
        loss = L.reduce_mean(L.relu(neg - pos + self.hparam.margin))
        return loss

    def backward(self, loss):
        """Minimize ``loss`` with Adam at the configured learning rate."""
        adam = F.optimizer.Adam(learning_rate=self.hparam['learning_rate'])
        adam.minimize(loss)

    def metrics(self, predictions, label):
        """Return no metrics for this task."""
        return {}
from link_predict import train
from link_predict import predict
# Train the link-prediction model through the project-local `train` entry
# point, passing the model class itself rather than an instance.
train(config, ERNIESageLinkPredictModel)
# Run the project-local `predict` entry point with the same config and class.
# NOTE(review): presumably this writes embeddings under output/ (the next
# cell inspects output/part-0) -- confirm against link_predict.predict.
predict(config, ERNIESageLinkPredictModel)
! head output/part-0
3.2 评价
!python build_dev.py --path "./example_data/link_predict/graph_data.txt" # 此命令用于将训练数据输出为需要的格式,产生的文件为dev_out.txt
# 接下来,计算MRR得分。
# 注意,运行此代码的前提是,我们已经将config对应的yaml配置文件中的input_data参数修改为了:"data.txt"
# 并且注意训练的模型是针对data.txt的,如果不符合,请重新训练模型。
!python mrr.py --emb_path output/part-0