Graph Neural Network Learning Notes

Learning goal:

Tip: learn graph neural networks
Recommended: Baidu's 7-day graph neural network training camp


Learning content:

Baidu's graph neural network framework

1. Set up the PaddlePaddle (飞桨) development environment

conda activate paddle_gpu
python -m pip install paddlepaddle-gpu==1.8.3.post107 -i https://mirror.baidu.com/pypi/simple
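
If the install succeeded, PaddlePaddle 1.x's built-in self-check should pass (run it inside the paddle_gpu environment):

import paddle.fluid as fluid
fluid.install_check.run_check()  # runs a tiny program and reports whether Paddle Fluid works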

2. Set up the PGL development environment

!pip install pgl
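
A quick import check confirms PGL is available (pgl.__version__ is assumed to exist here, as it does for most packages):

import pgl
print(pgl.__version__)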

3. Learn graph neural network algorithms; this post walks through Baidu's ErnieSage algorithm
4. Implement graph neural network algorithms with PGL


The ErnieSage algorithm:

1. ERNIE background
ERNIE is Baidu's pre-trained language model; like BERT, it encodes a token sequence and exposes a sentence-level representation at the [CLS] position.

2. ERNIE core idea
Instead of masking only individual tokens during pre-training, ERNIE also masks whole entities and phrases, so knowledge-level information is baked into the learned representation.

3. Applying ERNIE to graph structure: the ErnieSage algorithm
ErnieSage combines ERNIE with GraphSAGE, so that text semantics and graph structure are modeled jointly.
3.1 ErnieSage Node
ERNIE first encodes each node's text into a [CLS] embedding; a GraphSAGE step then aggregates the embeddings of neighboring nodes. This is the V1 model implemented below.

3.2 ErnieSage Edge
Here ERNIE is applied on the edges instead: the source and destination node texts are concatenated and fed through ERNIE during message passing, so the two texts interact at the token level. This is the V2 model below.
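
As a rough illustration of the difference, here is a toy numpy sketch; the ernie() stub and the tiny graph are made up for illustration and are not PGL or ERNIE APIs (the real implementations follow in section 2):

import numpy as np

def ernie(text):
    # stand-in for the real ErnieModel: maps a text to a fake [CLS] embedding
    rng = np.random.RandomState(abs(hash(text)) % (2**31))
    return rng.rand(4)

texts = {0: "what is pgl", 1: "pgl is a graph learning library", 2: "paddle"}
edges = [(1, 0), (2, 0)]  # messages flow src -> dst, all into node 0

# ErnieSage Node (V1): encode each node's text first, then sum neighbor embeddings
h = {n: ernie(t) for n, t in texts.items()}
v1_node0 = h[0] + sum(h[s] for s, d in edges if d == 0)

# ErnieSage Edge (V2): encode the concatenated (src, dst) text per edge, then sum
v2_node0 = sum(ernie(texts[s] + " [SEP] " + texts[d]) for s, d in edges if d == 0)

print(v1_node0, v2_node0)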


Implementing ErnieSage with PGL:

1. Data

1.1 Inspect the example dataset

! head -n 3 example_data/link_predict/graph_data.txt
! wc -l example_data/link_predict/graph_data.txt

1.2 Representing a text graph


from preprocessing.dump_graph import dump_graph
from preprocessing.dump_graph import dump_node_feat
from preprocessing.dump_graph import download_ernie_model
from preprocessing.dump_graph import load_config
from pgl.graph_wrapper import BatchGraphWrapper
import propeller.paddle as propeller
import paddle.fluid as F
import paddle.fluid.layers as L
import numpy as np
from models.pretrain_model_loader import PretrainedModelLoader
from pgl.graph import MemmapGraph
from models.encoder import linear
from ernie import ErnieModel
np.random.seed(123)
config = load_config("./config/erniesage_link_predict.yaml")
# Build a text graph from the raw QA data and save it to workdir with graph.dump
dump_graph(config)
dump_node_feat(config)
# MemmapGraph loads a graph previously saved with graph.dump
graph = MemmapGraph("./workdir/")
# Basic graph info
print("num_nodes:", graph.num_nodes)
print("edges:", graph.edges, graph.edges.shape)
# Node features
print([("%s shape is %s" % (key, str(graph.node_feat[key].shape))) for key in graph.node_feat])
print(graph.node_feat)  # text converted to character-level IDs; each text is one node, truncated/padded to length 40
# 1021 nodes, each with an ID sequence of length 40

2. Model
2.1 ERNIESage V1 core code

# ERNIESage V1: ERNIE is applied on the nodes
class ERNIESageV1Encoder():
    def __init__(self, config):
        self.config = config

    def __call__(self, graph_wrappers, inputs):

        # step1. ERNIE extracts node semantics
        # input: the token ID sequence of each node's text
        term_ids = graph_wrappers[0].node_feat["term_ids"]

        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1], "int64",
                                              self.config.cls_id) # cls [B, 1]
        term_ids = L.concat([cls, term_ids], 1) # term_ids [B, S+1]
        # [CLS], id1, id2, id3 .. [SEP]

        ernie_model = ErnieModel(self.config.ernie_config)
        # take ERNIE's representation at the [CLS] position
        cls_feat, _ = ernie_model(term_ids) # cls_feat [B, F]

        # step2. GNN aggregation
        feature = graphsage_sum(cls_feat, graph_wrappers[0], self.config.hidden_size, "v1_graphsage_sum", "leaky_relu")

        final_feats = [
            self.take_final_feature(feature, i, "v1_final_fc") for i in inputs
        ]
        return final_feats

    def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat


def graphsage_sum(feature, gw, hidden_size, name, act):
    # copy_send: each edge forwards the source node's feature
    msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
    # sum_recv: sum the incoming messages per node
    neigh_feature = gw.recv(msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

    self_feature = linear(feature, hidden_size, name+"_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name+"_r", act)
    output = L.concat([self_feature, neigh_feature], axis=1) # [B, 2H]
    output = L.l2_normalize(output, axis=1)
    return output
# Construct some random toy data
feat_size = 40
feed_dict = {
    "num_nodes": np.array([4]),
    "num_edges": np.array([6]),
    "edges": np.array([[0,1],[1,0],[0,2],[2,0],[0,3],[3,0]]),
    "term_ids": np.random.randint(4, 10000, size=(4, feat_size)),
    "inputs": np.array([0])}
place = F.CUDAPlace(0)
exe = F.Executor(place)
# V1 model
erniesage_v1_encoder = ERNIESageV1Encoder(config)

main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        node_feat = L.data("term_ids", [-1, 40], False, 'int64')
        inputs = L.data("inputs", [-1], False, 'int64')

        # build a graph from its basic components (nodes, edges, node features)
        gw = BatchGraphWrapper(num_nodes, num_edges, edges, {"term_ids": node_feat})
        outputs = erniesage_v1_encoder([gw], [inputs])

exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)
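
If everything is wired correctly, outputs_np should contain one l2-normalized row per index in inputs, i.e. shape [1, hidden_size] for the single node queried here.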

2.2 ERNIESage V2 core code

# ERNIESage V2: ERNIE is applied on the edges
class ERNIESageV2Encoder():
    def __init__(self, config):
        self.config = config

    def __call__(self, graph_wrappers, inputs):
        gw = graph_wrappers[0]
        term_ids = gw.node_feat["term_ids"] # term_ids [B, S]

        # step1. GNN send: forward the raw token IDs along each edge
        def ernie_send(src_feat, dst_feat, edge_feat):
            def build_position_ids(term_ids):
                input_mask = L.cast(term_ids > 0, "int64")
                position_ids = L.cumsum(input_mask, axis=1) - 1
                return position_ids

            # src_ids / dst_ids: token ID sequences of the sending and receiving nodes
            src_ids, dst_ids = src_feat["term_ids"], dst_feat["term_ids"]

            # build the [CLS] column and prepend it to the source sequence
            cls = L.fill_constant_batch_size_like(
                src_feat["term_ids"], [-1, 1], "int64", self.config.cls_id) # cls [B, 1]
            src_ids = L.concat([cls, src_ids], 1) # src_ids [B, S+1]

            # concat src and dst into one full token ID sequence
            term_ids = L.concat([src_ids, dst_ids], 1) # term_ids [B, 2S+1]
            # [CLS], src_id1, src_id2.. [SEP], dst_id1, dst_id2..[SEP]

            sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
            #   0, 0, 0 .. 0, 1, 1 .. 1

            position_ids = build_position_ids(term_ids)
            #   0, 1, 2, 3 ..

            # step2. ERNIE extracts edge semantics
            ernie_model = ErnieModel(self.config.ernie_config)
            cls_feat, _ = ernie_model(term_ids, sent_ids, position_ids)
            # cls_feat is ERNIE's sentence-level hidden representation
            return cls_feat

        msg = gw.send(ernie_send, nfeat_list=[("term_ids", term_ids)])

        # step3. GNN recv: aggregate neighbor semantics
        # sum the received [CLS] representations of the neighbors
        neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))

        # also prepend a [CLS] column for each center node
        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1],
                                              "int64", self.config.cls_id)

        term_ids = L.concat([cls, term_ids], 1)
        # [CLS], id1, id2, ... [SEP]

        # step4. ERNIE extracts center-node semantics, then concat
        # run the center node through ERNIE once more
        ernie_model = ErnieModel(self.config.ernie_config)

        # [CLS] representation of the center node
        self_cls_feat, _ = ernie_model(term_ids)

        hidden_size = self.config.hidden_size
        self_feature = linear(self_cls_feat, hidden_size, "erniesage_v2_l", "leaky_relu")
        neigh_feature = linear(neigh_feature, hidden_size, "erniesage_v2_r", "leaky_relu")
        output = L.concat([self_feature, neigh_feature], axis=1)
        output = L.l2_normalize(output, axis=1)

        final_feats = [
            self.take_final_feature(output, i, "v2_final_fc") for i in inputs
        ]
        return final_feats

    def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat
# Run it directly, reusing the feed_dict from above
erniesage_v2_encoder = ERNIESageV2Encoder(config)

main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        node_feat = L.data("term_ids", [-1, 40], False, 'int64')
        inputs = L.data("inputs", [-1], False, 'int64')

        gw = BatchGraphWrapper(num_nodes, num_edges, edges, {"term_ids": node_feat})
        outputs = erniesage_v2_encoder([gw], [inputs])

exe = F.Executor(place)
exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)

3. Training
3.1 Using a link prediction task as an example: load the semantic graph built above and train unsupervised, treating its edges as positive targets.

class ERNIESageLinkPredictModel(propeller.train.Model):
    def __init__(self, hparam, mode, run_config):
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        num_nodes, num_edges, edges, node_feat_index, node_feat_term_ids, user_index, \
            pos_item_index, neg_item_index, user_real_index, pos_item_real_index = features

        node_feat = {"index": node_feat_index, "term_ids": node_feat_term_ids}
        graph_wrapper = BatchGraphWrapper(num_nodes, num_edges, edges,
                                          node_feat)

        #encoder = ERNIESageV1Encoder(self.hparam)
        encoder = ERNIESageV2Encoder(self.hparam)
        #encoder = ERNIESageV3Encoder(self.hparam)

        # extract features for the center node, its neighbor (positive),
        # and randomly sampled negative nodes
        outputs = encoder([graph_wrapper],
                          [user_index, pos_item_index, neg_item_index])
        user_feat, pos_item_feat, neg_item_feat = outputs

        if self.mode is not propeller.RunMode.PREDICT:
            return user_feat, pos_item_feat, neg_item_feat
        else:
            return user_feat, user_real_index

    def loss(self, predictions, labels):
        user_feat, pos_item_feat, neg_item_feat = predictions
        pos = L.reduce_sum(user_feat * pos_item_feat, -1, keep_dim=True)
        #neg = L.reduce_sum(user_feat * neg_item_feat, -1, keep_dim=True)  # 60. (one negative per example)
        neg = L.matmul(user_feat, neg_item_feat, transpose_y=True)  # 80. (score against all in-batch negatives)
        # hinge loss: similarity(center, neighbor) should exceed
        # similarity(center, random negative) by at least the margin
        loss = L.reduce_mean(L.relu(neg - pos + self.hparam.margin))
        return loss

    def backward(self, loss):
        adam = F.optimizer.Adam(learning_rate=self.hparam['learning_rate'])
        adam.minimize(loss)

    def metrics(self, predictions, label):
        return {}
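
To make the loss concrete, here is a minimal numpy sketch with toy embeddings; the margin value 0.3 below is assumed for illustration, not read from the actual config:

import numpy as np

user     = np.array([[0.6, 0.8]])  # [B, F] l2-normalized center-node embedding
pos_item = np.array([[0.6, 0.8]])  # positive neighbor (identical here, similarity 1.0)
neg_item = np.array([[1.0, 0.0],
                     [0.0, 1.0]])  # in-batch negatives
pos = np.sum(user * pos_item, -1, keepdims=True)  # [B, 1] positive similarity
neg = user @ neg_item.T                           # [B, num_neg] negative similarities
margin = 0.3                                      # hypothetical margin
loss = np.mean(np.maximum(0.0, neg - pos + margin))
print(loss)  # 0.05: only the second negative (similarity 0.8) violates the margin
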
from link_predict import train
from link_predict import predict

train(config, ERNIESageLinkPredictModel)
predict(config, ERNIESageLinkPredictModel)
! head output/part-0

3.2 Evaluation

!python build_dev.py --path "./example_data/link_predict/graph_data.txt" # converts the training data into the format needed for evaluation, producing dev_out.txt
# Next, compute the MRR score.
# Note: this assumes the input_data parameter in the corresponding yaml config has been changed to "data.txt",
# and that the model was trained on data.txt; if that is not the case, retrain the model first.
!python mrr.py --emb_path output/part-0
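
For reference, MRR (Mean Reciprocal Rank) averages the reciprocal of the rank at which the true neighbor appears among all candidates. Below is a minimal numpy sketch of the metric itself, not of the repo's mrr.py (which reads embeddings from output/part-0); the toy scores matrix assumes the true candidate for query i is candidate i:

import numpy as np

def mean_reciprocal_rank(scores):
    # scores[i, j] = similarity of query i to candidate j
    order = np.argsort(-scores, axis=1)  # candidates sorted by descending score
    # 1-based rank of the true candidate (assumed to be candidate i for query i)
    ranks = np.where(order == np.arange(len(scores))[:, None])[1] + 1
    return float(np.mean(1.0 / ranks))

scores = np.array([[0.9, 0.2, 0.1],
                   [0.3, 0.8, 0.4],
                   [0.2, 0.7, 0.6]])
print(mean_reciprocal_rank(scores))  # ranks 1, 1, 2 -> (1 + 1 + 1/2) / 3 ≈ 0.83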

Takeaways:

Baidu's 7-day graph neural network training camp taught me a great deal. Best of all, the course was free, the cloud GPU compute was free, and a professional team was on hand to answer questions. Beyond the lectures, homework assignments reinforced the material, and I got to know Baidu's deep learning framework PaddlePaddle and its graph learning library PGL. Working through the homework and the competition, constant hyperparameter tuning sharpened my model-tuning skills. I also came away with a sense of how broadly GNNs apply: any problem that can be cast as a graph can be attacked with a GNN, and since relations between almost any entities form a graph, everything can be a graph!