图神经网络学习心得

最新推荐文章于 2024-06-12 13:38:31 发布

沙波什尼科夫

最新推荐文章于 2024-06-12 13:38:31 发布

阅读量707

点赞数

分类专栏：图神经网络文章标签：深度学习

原文链接：https://github.com/PaddlePaddle/PGL/tree/main/course

版权

图神经网络专栏收录该内容

1 篇文章 0 订阅

订阅专栏

学习目标：

提示：学习图神经网络
推荐：百度的图神经网络7日打卡训练营

学习内容：

百度的图神经网络框架

1、搭建飞浆(paddlepaddle)开发环境

conda activate paddle_gpu
python -m pip install paddlepaddle-gpu==1.8.3.post107 -i https://mirror.baidu.com/pypi/simple

2、搭建 pgl 开发环境

!pip install pgl

3、掌握图神经网络算法这里描述一下百度研发出来的 ErnieSage算法
4、 pgl实现图神经网络算法

ErnieSage算法：

提示：这里可以添加计划学习的时间
1、Ernie 背景知识
在这里插入图片描述

2、 ERNIE 核心思想
在这里插入图片描述

3、Ernie用在图结构上即ErnieSage算法
在这里插入图片描述
3.1 ErnieSage Node

3.2 ErnieSage Edge
在这里插入图片描述

PGL 代码实现ErnieSage：

1、数据

1.1输入example数据集

! head -n 3 example_data/link_predict/graph_data.txt
! wc -l example_data/link_predict/graph_data.txt

1.2 表达一个文本图


from preprocessing.dump_graph import dump_graph
from preprocessing.dump_graph import dump_node_feat
from preprocessing.dump_graph import download_ernie_model
from preprocessing.dump_graph import load_config
from pgl.graph_wrapper import BatchGraphWrapper
import propeller.paddle as propeller
import paddle.fluid as F
import paddle.fluid.layers as L
import numpy as np
from preprocessing.dump_graph import load_config
from models.pretrain_model_loader import PretrainedModelLoader
from pgl.graph import MemmapGraph
from models.encoder import linear
from ernie import ErnieModel
np.random.seed(123)
config = load_config("./config/erniesage_link_predict.yaml")

# 将原始QA数据产出一个文本图，并使用grpah.dump存放到 workdir 目录下
dump_graph(config)
dump_node_feat(config)

# MemmapGraph可以将PGL中graph.dump的模型，重新load回来
graph = MemmapGraph("./workdir/") 
# 看一下图基础信息
print("节点", graph.num_nodes,"个") 
print("边", graph.edges, graph.edges.shape)

# 看一下节点特征
print([("%s shape is %s" % (key, str(graph.node_feat[key].shape))) for key in graph.node_feat])
print(graph.node_feat) #  按字的粒度转成ID，每段文本为一个节点，文本全部保留40长度
# 1021个节点，每个节点有长度为40的id序列

2、模型
2.1 ERNIESage V1 核心代码

# ERNIESage V1，ERNIE作用在节点上
class ERNIESageV1Encoder():
    def __init__(self, config):
        self.config = config

    def __call__(self, graph_wrappers, inputs):
        
        # step1. ERNIE提取节点语义
        # 输入每个节点的文本的id序列
        term_ids = graph_wrappers[0].node_feat["term_ids"]
        
        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1], "int64",
                                              self.config.cls_id) # cls [B, 1]
        term_ids = L.concat([cls, term_ids], 1) # term_ids [B, S]
        # [CLS], id1, id2, id3 .. [SEP]

        ernie_model = ErnieModel(self.config.ernie_config) 
        # 获得ERNIE的[CLS]位置的表达
        cls_feat, _ = ernie_model(term_ids) # cls_feat [B, F]

        # step2. GNN聚合
        feature = graphsage_sum(cls_feat, graph_wrappers[0], self.config.hidden_size, "v1_graphsage_sum", "leaky_relu")
        
        final_feats = [
            self.take_final_feature(feature, i, "v1_final_fc") for i in inputs
        ]
        return final_feats
    
    def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat


def graphsage_sum(feature, gw, hidden_size, name, act):
    # copy_send
    msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
    # sum_recv
    neigh_feature = gw.recv(msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

    self_feature = linear(feature, hidden_size, name+"_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name+"_r", act)
    output = L.concat([self_feature, neigh_feature], axis=1) # [B, 2H]
    output = L.l2_normalize(output, axis=1)
    return output

# 随机构造些数据
feat_size = 40
feed_dict = {
    "num_nodes": np.array([4]),
    "num_edges": np.array([6]),
    "edges": np.array([[0,1],[1,0],[0,2],[2,0],[0,3],[3,0]]),
    "term_ids": np.random.randint(4, 10000, size=(4, feat_size)),
    "inputs": np.array([0])}
place = F.CUDAPlace(0)
exe = F.Executor(place)

# 模型v1
erniesage_v1_encoder = ERNIESageV1Encoder(config)

main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        node_feat = L.data("term_ids", [-1, 40], False, 'int64')
        inputs = L.data("inputs", [-1], False, 'int64')

        # 输入图的基本信息（边、点、特征）构造一个graph 
        gw = BatchGraphWrapper(num_nodes, num_edges, edges, {"term_ids": node_feat})
        outputs = erniesage_v1_encoder([gw], [inputs])

exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)

2.2 ERNIESage V2 核心代码

# ERNIESage V2，ERNIE作用在边上
class ERNIESageV2Encoder():
    def __init__(self, config):
        self.config = config

    def __call__(self, graph_wrappers, inputs):
        gw = graph_wrappers[0]
        term_ids = gw.node_feat["term_ids"] # term_ids [B, S]
        
        # step1. GNN send 文本id
        def ernie_send(src_feat, dst_feat, edge_feat):
            def build_position_ids(term_ids):
                input_mask = L.cast(term_ids > 0, "int64")
                position_ids = L.cumsum(input_mask, axis=1) - 1
                return position_ids
            
            # src_ids, dst_ids 为发送src和接收dst节点分别的文本ID序列
            src_ids, dst_ids = src_feat["term_ids"], dst_feat["term_ids"]

            # 生成[CLS]对应的id列, 并与前半段concat
            cls = L.fill_constant_batch_size_like(
                src_feat["term_ids"], [-1, 1], "int64", self.config.cls_id) # cls [B, 1]
            src_ids = L.concat([cls, src_ids], 1) # src_ids [B, S+1]

            # 将src与dst concat在一起作为完整token ids
            term_ids = L.concat([src_ids, dst_ids], 1) # term_ids [B, 2S+1]
            # [CLS], src_id1, src_id2.. [SEP], dst_id1, dst_id2..[SEP]

            sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
            #   0, 0, 0 .. 0, 1, 1 .. 1 

            position_ids = build_position_ids(term_ids)
            #   0, 1, 2, 3 ..  
            
            # step2. ERNIE提取边语义 
            ernie_model = ErnieModel(self.config.ernie_config)
            cls_feat, _ = ernie_model(term_ids, sent_ids, position_ids)
            # cls_feat 为ERNIE提取的句子级隐向量表达
            return cls_feat

        msg = gw.send(ernie_send, nfeat_list=[("term_ids", term_ids)])
        
        # step3. GNN recv 聚合邻居语义 
        # 接收了邻居的CLS语义表达，sum聚合在一起
        neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))

        # 为每个节点也拼接一个CLS表达
        cls = L.fill_constant_batch_size_like(term_ids, [-1, 1],
                                              "int64", self.config.cls_id)
        
        term_ids = L.concat([cls, term_ids], 1)
        # [CLS], id1, id2, ... [SEP]
        
        # step4. ERNIE提取中心节点语义并concat
        # 对中心节点过一次ERNIE    
        ernie_model = ErnieModel(self.config.ernie_config)

        # 获取中心节点的语义CLS表达
        self_cls_feat, _ = ernie_model(term_ids)

        hidden_size = self.config.hidden_size        
        self_feature = linear(self_cls_feat, hidden_size, "erniesage_v2_l", "leaky_relu")
        neigh_feature = linear(neigh_feature, hidden_size, "erniesage_v2_r", "leaky_relu")
        output = L.concat([self_feature, neigh_feature], axis=1)
        output = L.l2_normalize(output, axis=1)

        final_feats = [
            self.take_final_feature(output, i, "v2_final_fc") for i in inputs
        ]
        return final_feats

    def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat

# 直接run一下
erniesage_v2_encoder = ERNIESageV2Encoder(config)

main_prog, start_prog = F.Program(), F.Program()
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():
        num_nodes = L.data("num_nodes", [1], False, 'int64')
        num_edges = L.data("num_edges", [1], False, 'int64')
        edges = L.data("edges", [-1, 2], False, 'int64')
        node_feat = L.data("term_ids", [10, 40], False, 'int64')
        inputs = L.data("inputs", [2], False, 'int64')

        gw = BatchGraphWrapper(num_nodes, num_edges, edges, {"term_ids": node_feat})
        outputs = erniesage_v2_encoder([gw], [inputs])

exe = F.Executor(place)
exe.run(start_prog)
outputs_np = exe.run(main_prog, feed=feed_dict, fetch_list=[outputs])[0]
print(outputs_np)

3、训练
3.1 以一个link predict的任务为例，读取一个语义图，以上面的边为目标进行无监督的训练

class ERNIESageLinkPredictModel(propeller.train.Model):
    def __init__(self, hparam, mode, run_config):
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        num_nodes, num_edges, edges, node_feat_index, node_feat_term_ids, user_index, \
            pos_item_index, neg_item_index, user_real_index, pos_item_real_index = features

        node_feat = {"index": node_feat_index, "term_ids": node_feat_term_ids}
        graph_wrapper = BatchGraphWrapper(num_nodes, num_edges, edges,
                                          node_feat)

        #encoder = ERNIESageV1Encoder(self.hparam)
        encoder = ERNIESageV2Encoder(self.hparam)
        #encoder = ERNIESageV3Encoder(self.hparam)

        # 中心节点、邻居节点、随机采样节点 分别提取特征
        outputs = encoder([graph_wrapper],
                          [user_index, pos_item_index, neg_item_index])
        user_feat, pos_item_feat, neg_item_feat = outputs
    
        if self.mode is not propeller.RunMode.PREDICT:
            return user_feat, pos_item_feat, neg_item_feat
        else:
            return user_feat, user_real_index

    def loss(self, predictions, labels):
        user_feat, pos_item_feat, neg_item_feat = predictions
        pos = L.reduce_sum(user_feat * pos_item_feat, -1, keep_dim=True) # 
        #neg = L.reduce_sum(user_feat * neg_item_feat, -1, keep_dim=True)# 60.
        neg = L.matmul(user_feat, neg_item_feat, transpose_y=True) # 80.
        # 距离（中心，邻居）> 距离(中心，随机负)
        loss = L.reduce_mean(L.relu(neg - pos + self.hparam.margin))
        return loss

    def backward(self, loss):
        adam = F.optimizer.Adam(learning_rate=self.hparam['learning_rate'])
        adam.minimize(loss)

    def metrics(self, predictions, label):
        return {}

from link_predict import train
from link_predict import predict

train(config, ERNIESageLinkPredictModel)

predict(config, ERNIESageLinkPredictModel)

! head output/part-0

3.2 评价

!python build_dev.py --path "./example_data/link_predict/graph_data.txt" # 此命令用于将训练数据输出为需要的格式，产生的文件为dev_out.txt

# 接下来，计算MRR得分。
# 注意，运行此代码的前提是，我们已经将config对应的yaml配置文件中的input_data参数修改为了："data.txt"
# 并且注意训练的模型是针对data.txt的，如果不符合，请重新训练模型。
!python mrr.py --emb_path output/part-0

学习心得：

通过这次参加百度的图神经网络的7日打卡训练营，收获颇多。最高兴的是这次课程是免费的，云GPU算力也是免费的，还有专业的答疑团队。除了学习知识以外，还有作业实践来巩固，了解了百度的深度学习框架飞浆和对应的图框架PGL。在做作业和比赛的过程中，通过不断调参又提升了炼丹功力。对图神经网络的应用面也有所了解，只有问题能构成一个图结构就可以用图神经网络解决，事实上任何事物之间都可以构成图结构，即万物皆可图！