A Detailed Walkthrough of Su Jianlin's DGCNN Model Code

1. Overview

The code below is the DGCNN model that Su Jianlin proposed for the Baidu information-extraction competition. The original source has almost no comments, so this post annotates most of it; if any of the annotations are wrong, corrections are welcome. In addition, the computation inside the Attention module has been modified so that it runs under a newer Keras version.

2. Annotated Code

#! -*- coding:utf-8 -*-


from __future__ import print_function

import json
import os
import re
from random import choice
# keyword-matching tool (Aho-Corasick automaton)
import ahocorasick
import numpy as np
# word segmentation tool
import pyhanlp
from gensim.models import Word2Vec
from tqdm import tqdm
import tensorflow as tf

mode = 0
char_size = 128
maxlen = 512

word2vec = Word2Vec.load('data/word2vec_baike')
# build the id <-> word mappings
id2word = {i + 1: j for i, j in enumerate(word2vec.wv.index2word)}
word2id = {j: i for i, j in id2word.items()}
word2vec = word2vec.wv.vectors
word_size = word2vec.shape[1]
# prepend a zero vector so that id 0 can act as padding
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])


def tokenize(s):
    # segment the sentence into words with HanLP
    return [i.word for i in pyhanlp.HanLP.segment(s)]


# effectively the word-embedding lookup
def sent2vec(S):
    """S格式:[[w1, w2,...],...]
    """
    V = []
    for s in S:
        V.append([])
        for w in s:
            for _ in w:
                # w is a whole word, not a single character; to keep the word sequence aligned with the character sequence, the word id is repeated once for every character in the word
                V[-1].append(word2id.get(w, 0))
    # pad the id sequences to the same length
    V = seq_padding(V)
    # map the id sequences to word vectors: [[wid1, wid2, ...], ...] -> (n, len, word_size)
    V = word2vec[V]
    return V
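
# Illustration only (not part of the original script): a minimal sketch of the alignment trick used
# in sent2vec above, with a made-up toy_word2id (an assumption, not the real vocabulary). Each word
# id is repeated once per character so the word-level sequence lines up with the character sequence.
def _demo_word_char_alignment():
    toy_word2id = {u'周杰伦': 1, u'唱': 2, u'歌': 3}
    toy_sentence = [u'周杰伦', u'唱', u'歌']  # one segmented sentence, 5 characters in total
    ids = []
    for w in toy_sentence:
        for _ in w:  # repeat the word id once per character of the word
            ids.append(toy_word2id.get(w, 0))
    return ids  # [1, 1, 1, 2, 3] -> same length as the character string u'周杰伦唱歌'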


# load the data; format: [{'text': '', 'spo_list': [[s, p, o], ...]}, ...]
total_data = json.load(open('./data/train_data_me.json', encoding='utf-8'))
# id2predicate={'id':p,...}   predicate2id={p:id,...}
id2predicate, predicate2id = json.load(open('./data/all_50_schemas_me.json', encoding='utf-8'))
# id2predicate={id:p,...}
id2predicate = {int(i): j for i, j in id2predicate.items()}
# id2char={'id':word,...}  char2id={word:id,...}
id2char, char2id = json.load(open('./data/all_chars_me.json', encoding='utf-8'))
# total number of relation (predicate) classes
num_classes = len(id2predicate)
# create (or load) a shuffled index order over the whole dataset
if not os.path.exists('data/random_order_vote.json'):
    random_order = [i for i in range(len(total_data))]
    np.random.shuffle(random_order)
    json.dump(
        random_order,
        open('data/random_order_vote.json', 'w', encoding='utf-8'),
        indent=4
    )
else:
    random_order = json.load(open('data/random_order_vote.json', encoding='utf-8'))
# split the data into train and dev: every 8th sample (by shuffled order) goes to dev, i.e. roughly 7:1
train_data = [total_data[j] for i, j in enumerate(random_order) if i % 8 != mode]
dev_data = [total_data[j] for i, j in enumerate(random_order) if i % 8 == mode]
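
# Illustration only: how the mod-8 split above behaves on 10 toy indices (assuming mode = 0).
# Every 8th sample in the shuffled order goes to dev, the rest to train (roughly a 7:1 split).
def _demo_mod8_split():
    toy_order = list(range(10))
    toy_train = [j for i, j in enumerate(toy_order) if i % 8 != 0]
    toy_dev = [j for i, j in enumerate(toy_order) if i % 8 == 0]
    return toy_train, toy_dev  # ([1, 2, 3, 4, 5, 6, 7, 9], [0, 8])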

predicates = {}  # format: {predicate: [(subject, predicate, object), ...]}


def repair(d):
    # lower-case the text
    d['text'] = d['text'].lower()
    # extract the titles enclosed in Chinese book-title marks 《》
    something = re.findall(u'《([^《》]*?)》', d['text'])
    something = [s.strip() for s in something]
    zhuanji = []  # albums (objects of '所属专辑' triples)
    gequ = []  # the corresponding songs (their subjects)
    for sp in d['spo_list']:
        # strip book-title marks and surrounding whitespace from subject and object
        sp[0] = sp[0].strip(u'《》').strip().lower()
        sp[2] = sp[2].strip(u'《》').strip().lower()
        # if the subject is part of an extracted title and occurs only once in the text, expand it to the full title
        for some in something:
            if sp[0] in some and d['text'].count(sp[0]) == 1:
                sp[0] = some
        if sp[1] == u'所属专辑':
            zhuanji.append(sp[2])
            gequ.append(sp[0])
    spo_list = []
    for sp in d['spo_list']:
        if sp[1] in [u'歌手', u'作词', u'作曲']:
            if sp[0] in zhuanji and sp[0] not in gequ:
                continue
        spo_list.append(tuple(sp))
    d['spo_list'] = spo_list


# build the predicate -> triples dictionary (used later for data augmentation)
for d in train_data:
    # clean up the sample
    repair(d)
    for sp in d['spo_list']:
        if sp[1] not in predicates:
            predicates[sp[1]] = []
        predicates[sp[1]].append(sp)  # format: {predicate: [(subject, predicate, object), ...]}

for d in dev_data:
    repair(d)


# data augmentation: with probability 0.5, replace one triple's subject and object with those of another triple that has the same predicate
def random_generate(d, spo_list_key):
    r = np.random.random()
    if r > 0.5:  # with probability 0.5, return the sample unchanged
        return d
    else:
        # pick a random triple index; len(d[spo_list_key]) is the number of triples in this sentence
        k1 = np.random.randint(len(d[spo_list_key]))
        # the chosen triple
        spi = d[spo_list_key][k1]  # ['s','p','o']
        # randomly pick another triple with the same predicate from the predicates dict
        k2 = np.random.randint(len(predicates[spi[1]]))
        spo = predicates[spi[1]][k2]  # ['s','p','o']
        # replace the old subject/object strings with the new ones
        F = lambda s: s.replace(spi[0], spo[0]).replace(spi[2], spo[2])
        # apply the replacement to the text
        text = F(d['text'])
        # update the triple list accordingly
        spo_list = [(F(sp[0]), sp[1], F(sp[2])) for sp in d[spo_list_key]]
        # return the modified sample
        return {'text': text, spo_list_key: spo_list}


# pad a batch of sequences to the length of the longest one
def seq_padding(X, padding=None):
    if padding is None:
        padding = 0
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])
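
# Illustration only: what seq_padding does to a small ragged batch (toy values, not real data).
def _demo_seq_padding():
    batch = [[3, 1, 4], [5, 9], [2, 6, 5, 3]]
    padded = seq_padding(batch)
    # padded is a (3, 4) integer array:
    # [[3 1 4 0]
    #  [5 9 0 0]
    #  [2 6 5 3]]
    return padded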


class AC_Unicode:
    """稍微封装一下,弄个支持unicode的AC自动机
    """

    def __init__(self):
        self.ac = ahocorasick.Automaton()

    def add_word(self, k, v):
        # k = k.encode('utf-8')
        self.ac.add_word(k, v)

    def make_automaton(self):
        self.ac.make_automaton()

    def iter(self, s):
        # s = s.encode('utf-8')
        return self.ac.iter(s)


# builds AC automata over all subjects/objects in the training data; extract_items(text, idx) then looks up candidate triples in a sentence
class spo_searcher:
    def __init__(self, train_data):
        # AC automaton over subject keywords
        self.s_ac = AC_Unicode()
        # AC automaton over object keywords
        self.o_ac = AC_Unicode()
        self.so2p = {}
        self.spo_total = {}
        # collect triples from every training sample
        for i, d in enumerate(train_data):
            # train_data format: [{'text': '', 'spo_list': [['s', 'p', 'o'], ...]}, ...]
            for s, p, o in d['spo_list']:
                # register the keywords in the automata
                self.s_ac.add_word(s, s)
                self.o_ac.add_word(o, o)
                if (s, o) not in self.so2p:
                    self.so2p[(s, o)] = set()
                if (s, p, o) not in self.spo_total:
                    self.spo_total[(s, p, o)] = set()
                # so2p = {(s, o): {p, ...}}: every predicate observed for this (subject, object) pair
                self.so2p[(s, o)].add(p)
                # spo_total = {(s, p, o): {i, ...}}: indices of the training samples this triple appears in
                self.spo_total[(s, p, o)].add(i)
        # finalize the automata
        self.s_ac.make_automaton()
        self.o_ac.make_automaton()

    # extract candidate triples from a sentence, using the automata built from every subject and
    # object keyword seen in the training set
    def extract_items(self, text_in, text_idx=None):
        R = set()
        # scan the sentence with the subject automaton; each hit s is (end_index, matched_string)
        for s in self.s_ac.iter(text_in):
            # scan with the object automaton
            for o in self.o_ac.iter(text_in):
                # for every (subject, object) pair that was seen in training
                if (s[1], o[1]) in self.so2p:
                    # emit one triple for every predicate known for this pair
                    for p in self.so2p[(s[1], o[1])]:
                        # text_idx is None when extracting from text outside the training set (dev/test)
                        if text_idx is None:
                            # R->{('s','p','o')}
                            R.add((s[1], p, o[1]))
                        # for training samples, only keep the triple if it also occurs in some other sample (so the sample's own gold labels are not leaked)
                        elif self.spo_total[(s[1], p, o[1])] - {text_idx}:
                            R.add((s[1], p, o[1]))
        # return the candidate triples found in the sentence, R = {(s, p, o)} -> [(s, p, o)]
        return list(R)


# build the searcher over the training data
spoer = spo_searcher(train_data)
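
# Illustration only: how the pyahocorasick automaton is used above. add_word registers keywords,
# make_automaton finalizes the trie, and iter(text) yields (end_index, value) for every match,
# which is why the code above reads the matched string from s[1] / o[1]. Toy strings, not real data.
def _demo_ahocorasick():
    ac = ahocorasick.Automaton()
    for kw in [u'周杰伦', u'七里香']:
        ac.add_word(kw, kw)
    ac.make_automaton()
    return [(end, value) for end, value in ac.iter(u'周杰伦的七里香')]
    # -> [(2, u'周杰伦'), (6, u'七里香')]  (end indices are inclusive)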


class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            # shuffle the sample indices at the start of each epoch
            idxs = [i for i in range(len(self.data))]
            np.random.shuffle(idxs)
            # one list per model input; they are filled below and yielded together as a batch
            T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO = [], [], [], [], [], [], [], [], [], []
            for i in idxs:
                spo_list_key = 'spo_list'  # if np.random.random() > 0.5 else 'spo_list_with_pred'
                # data augmentation: random same-predicate replacement
                d = random_generate(self.data[i], spo_list_key)  # 对数据进行随机替换
                # truncate the text to maxlen characters
                text = d['text'][:maxlen]
                # word segmentation
                text_words = tokenize(text)
                # re-join the words into a string; this is the text used from here on
                text = ''.join(text_words)
                # items maps each subject span to the object spans and predicate ids paired with it
                items = {}
                # [[s,p,o],...]
                for sp in d[spo_list_key]:
                    # locate the subject and the object in the text
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    # find() returns -1 when the string is absent
                    if subjectid != -1 and objectid != -1:
                        # (sid,sid+slen)
                        key = (subjectid, subjectid + len(sp[0]))
                        if key not in items.keys():
                            items[key] = []
                        # {(sid,sid+slen):[(oid,oid+olen,pid),...]}
                        items[key].append((objectid,
                                           objectid + len(sp[2]),
                                           predicate2id[sp[1]]))  # predicate2id={p:id,...}
                # additionally extract candidate triples with the AC automata (used as prior features)
                pre_items = {}
                # candidate triples returned by the searcher, R -> [('s', 'p', 'o')]
                for sp in spoer.extract_items(text, i):
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    if subjectid != -1 and objectid != -1:
                        key = (subjectid, subjectid + len(sp[0]))
                        if key not in pre_items:
                            pre_items[key] = []
                        # {(sid,sid+slen):[(oid,oid+olen,pid)]}
                        pre_items[key].append((objectid,
                                               objectid + len(sp[2]),
                                               predicate2id[sp[1]]))
                # only use the sample if at least one gold triple was located in the text
                if items:
                    # convert the (re-joined, truncated) text into character ids, T1 -> [[], ...]
                    T1.append([char2id.get(c, 1) for c in text])  # 1 is unk, 0 is padding
                    # the word-segmented sentence, T2 -> [[], ...]
                    T2.append(text_words)
                    # s1,s2=[0,0,...]
                    s1, s2 = np.zeros(len(text)), np.zeros(len(text))
                    # items={(sid,sid+slen):[(oid,oid+olen,pid),...]}
                    # mark the subject positions in the sentence
                    for j in items:
                        # j = (start, end); s1 marks the subject start position
                        s1[j[0]] = 1
                        # s2 marks the subject end position (inclusive)
                        s2[j[1] - 1] = 1
                    # pres has shape (len, 2): column 0 marks candidate subject starts, column 1 marks candidate subject ends (prior features from the automata)
                    pres = np.zeros((len(text), 2))
                    # pre_items has the same structure as items
                    for j in pre_items:
                        pres[j[0], 0] = 1
                        pres[j[1] - 1, 1] = 1
                    # collect the (start, end) spans of all gold subjects in this sentence
                    a = []
                    for e in items.keys():
                        a.append(e)
                    # a->[(sid,sid+slen),...]
                    # k1 = array of subject starts, k2 = array of subject ends
                    k1, k2 = np.array(a).T
                    # randomly sample one subject start
                    k1 = choice(k1)
                    # sample an end position from those not before k1: the boolean mask k2 >= k1
                    # keeps only end positions at or after the sampled start (if the sentence has a
                    # single subject, its end is always chosen)
                    k2 = choice(k2[k2 >= k1])
                    # o1/o2 have shape (len, num_classes): marking the object position in its predicate's column encodes both the object span and the relation
                    o1, o2 = np.zeros((len(text), num_classes)), np.zeros((len(text), num_classes))
                    # if (k1, k2) is not an actual subject span, .get() returns [] and this loop is
                    # simply skipped, so the random choice above cannot cause an error
                    for j in items.get((k1, k2), []):
                        # j->(os,oe,pid)
                        o1[j[0], j[2]] = 1
                        o2[j[1] - 1, j[2]] = 1
                    # preo has shape (len, num_classes, 2): prior object start/end marks per
                    # predicate, taken from the automaton candidates for the sampled subject span
                    preo = np.zeros((len(text), num_classes, 2))
                    for j in pre_items.get((k1, k2), []):
                        preo[j[0], j[2], 0] = 1
                        preo[j[1] - 1, j[2], 1] = 1
                    # preo->(len,num*2)
                    preo = preo.reshape((len(text), -1))
                    # S1: 0/1 subject-start labels for each sentence
                    S1.append(s1)
                    # S2: 0/1 subject-end labels
                    S2.append(s2)
                    # K1: the sampled subject start position
                    K1.append([k1])
                    # K2: the sampled subject end position (stored as an inclusive index, hence k2 - 1)
                    K2.append([k2 - 1])
                    # O1: object-start labels per predicate, shape (len(text), num_classes)
                    O1.append(o1)
                    # O2: object-end labels per predicate
                    O2.append(o2)
                    # PRES: prior subject start/end marks
                    PRES.append(pres)
                    # PREO: prior object marks, shape (len, num_classes * 2)
                    PREO.append(preo)
                    # once a full batch is collected (or the data is exhausted), pad everything and yield the batch
                    if len(T1) == self.batch_size or i == idxs[-1]:
                        # T1: character id sequences
                        T1 = seq_padding(T1)
                        # T2: word vectors aligned with the characters
                        T2 = sent2vec(T2)
                        # S1: subject-start labels
                        S1 = seq_padding(S1)
                        # S2: subject-end labels
                        S2 = seq_padding(S2)
                        #
                        O1 = seq_padding(O1, np.zeros(num_classes))
                        O2 = seq_padding(O2, np.zeros(num_classes))
                        #
                        K1, K2 = np.array(K1), np.array(K2)
                        #
                        PRES = seq_padding(PRES, np.zeros(2))
                        PREO = seq_padding(PREO, np.zeros(num_classes * 2))
                        #
                        yield [T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO], None
                        T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO = [], [], [], [], [], [], [], [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam


# gather, for every sentence in the batch, the encoder vector at one given position (used for the sampled subject boundaries)
def seq_gather(x):
    """seq是[None, seq_len, s_size]的格式,
    idxs是[None, 1]的格式,在seq的第i个序列中选出第idxs[i]个向量,
    最终输出[None, s_size]的向量。
    """
    seq, idxs = x
    idxs = K.cast(idxs, 'int32')
    # K.shape(seq)[0] is the dynamic batch size, available at run time even though the static shape is None
    batch_idxs = K.arange(0, K.shape(seq)[0])  # (batch,)
    # (batch,1)
    batch_idxs = K.expand_dims(batch_idxs, 1)
    # (batch,2)
    idxs = K.concatenate([batch_idxs, idxs], 1)
    # gather one vector per sentence: each row of idxs is (batch_index, position); the result has
    # shape (batch, s_size)
    return tf.gather_nd(seq, idxs)
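
# Illustration only: a numpy sketch of what seq_gather / tf.gather_nd does here (toy shapes).
# For each sentence in the batch we pick the encoder vector at one position.
def _demo_seq_gather():
    seq = np.arange(24).reshape(2, 4, 3)   # (batch=2, seq_len=4, s_size=3)
    idxs = np.array([[1], [3]])            # position 1 in sentence 0, position 3 in sentence 1
    batch_idxs = np.arange(seq.shape[0])   # [0, 1]
    out = seq[batch_idxs, idxs[:, 0]]      # (2, 3), same result as tf.gather_nd with (batch, pos) pairs
    return out  # [[ 3  4  5]
                #  [21 22 23]]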


def seq_maxpool(x):
    """seq是[None, seq_len, s_size]的格式,
    mask是[None, seq_len, 1]的格式,先除去mask部分,
    然后再做maxpooling。
    """
    seq, mask = x
    # push padded positions down to -1e10 so they can never win the max
    seq -= (1 - mask) * 1e10
    # take the max over the time axis, returning (batch, 1, s_size)
    return K.max(seq, 1, keepdims=True)
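
# Illustration only: a numpy sketch of the masked max-pooling in seq_maxpool (toy values).
# Padded positions are pushed down by 1e10 so they can never win the max.
def _demo_masked_maxpool():
    seq = np.array([[[1.0], [5.0], [9.0]]])   # (batch=1, seq_len=3, s_size=1)
    mask = np.array([[[1.0], [1.0], [0.0]]])  # last position is padding
    masked = seq - (1 - mask) * 1e10
    return masked.max(axis=1, keepdims=True)  # [[[5.]]] -- the padded 9.0 is ignored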


def dilated_gated_conv1d(seq, mask, dilation_rate=1):
    """膨胀门卷积(残差式)
    """
    # seq=(batch,len,char_size),mask=(batch,len,1)
    dim = K.int_shape(seq)[-1]
    # h=(batch,len,char_size*2)
    h = Conv1D(dim * 2, 3, padding='same', dilation_rate=dilation_rate)(seq)

    def _gate(x):
        dropout_rate = 0.1
        s, h = x
        # g=h=(batch,len,char_size)
        g, h = h[:, :, :dim], h[:, :, dim:]
        # K.in_train_phase(x, y) returns x during training and y otherwise; dropout is applied only at training time
        g = K.in_train_phase(K.dropout(g, dropout_rate), g)
        # sigmoid gate
        g = K.sigmoid(g)
        # gated residual: g * input + (1 - g) * convolution output
        return g * s + (1 - g) * h

    # seq=(batch,len,char_size)
    seq = Lambda(_gate)([seq, h])
    seq = Lambda(lambda x: x[0] * x[1])([seq, mask])
    return seq
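
# Illustration only: a numpy sketch of the gate inside dilated_gated_conv1d (toy values).
# The output is sigmoid(g) * input + (1 - sigmoid(g)) * conv_output, so when the gate saturates
# towards 1 the layer behaves like an identity (residual) path.
def _demo_gated_residual():
    s = np.array([1.0, 2.0, 3.0])     # the layer input
    h = np.array([10.0, 20.0, 30.0])  # the convolution output
    g = 1.0 / (1.0 + np.exp(-np.array([4.0, 0.0, -4.0])))  # sigmoid gate ~ [0.98, 0.5, 0.02]
    return g * s + (1 - g) * h        # ~ [1.16, 11.0, 29.5]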


class Attention(Layer):
    """多头注意力机制
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        # the layer takes three inputs (q, k, v), plus optional value/query masks
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='w_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        # mask=(batch,len,1)
        if mask is None:
            return x
        else:
            # K.ndim() returns the number of dimensions; expand the mask until it matches x, e.g. (batch, len, 1) -> (batch, len, 1, 1)
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs, **kwargs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # linear projections: qw = kw = vw = (batch, len, out_dim = nb_head * size_per_head)
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        self.xlen = K.shape(qw)[1]
        # reshape to (batch, len, nb_head, size_per_head)
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # transpose to (batch, nb_head, len, size_per_head)
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # for compatibility with this Keras version, merge the batch and head axes: (batch * nb_head, len, size_per_head)
        qw = K.reshape(qw, (-1, K.shape(qw)[-2], K.shape(qw)[-1]))
        kw = K.reshape(kw, (-1, K.shape(kw)[-2], K.shape(kw)[-1]))
        vw = K.reshape(vw, (-1, K.shape(vw)[-2], K.shape(vw)[-1]))
        # scaled dot-product attention scores, a = (batch * nb_head, len, len)
        a = K.batch_dot(qw, kw, axes=[2, 2]) / self.size_per_head ** 0.5
        # restore the head axis and move it last, a = (batch, len, len, nb_head), so the (batch, len, 1) value mask broadcasts over the key positions
        a = K.reshape(a, (-1, self.nb_head, self.xlen, self.xlen))
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        # a=(batch,head_num,len,len)
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        # normalize the attention scores
        a = K.softmax(a)
        # a=(batch*head_num,len,len),vw=(batch*head_num,len,head_dim)-->o=(batch*head_num,len,head_dim)
        a = K.reshape(a, [-1, self.xlen, self.xlen])
        vw = K.reshape(vw, [-1, self.xlen, self.size_per_head])
        o = K.batch_dot(a, vw, axes=[2, 1])
        # o = K.batch_dot(a, vw, [3, 2])
        # o=(batch,head_num,len,head_dim)
        o = K.reshape(o, [-1, self.nb_head, self.xlen, self.size_per_head])
        #  o=(batch,len,head_num,head_dim)
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        # o=(batch,len,head_num*head_dim)
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
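
# Illustration only: a single-head numpy sketch of the scaled dot-product attention computed by the
# Attention layer above (softmax(Q K^T / sqrt(d)) V), with toy matrices instead of learned kernels.
def _demo_scaled_dot_attention():
    np.random.seed(0)
    q = np.random.randn(4, 16)  # (len, size_per_head)
    k = np.random.randn(4, 16)
    v = np.random.randn(4, 16)
    scores = q.dot(k.T) / 16 ** 0.5  # (len, len)
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = scores / scores.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights.dot(v)  # (len, size_per_head)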


# T1: character id sequence
t1_in = Input(shape=(None,))
# T2: word-vector sequence
t2_in = Input(shape=(None, word_size))
# S1: 0/1 subject-start labels
s1_in = Input(shape=(None,))
# S2: 0/1 subject-end labels
s2_in = Input(shape=(None,))
# K1: sampled subject start position
k1_in = Input(shape=(1,))
# K2: sampled subject end position
k2_in = Input(shape=(1,))
# O1: object-start labels per predicate
o1_in = Input(shape=(None, num_classes))
# O2: object-end labels per predicate
o2_in = Input(shape=(None, num_classes))
# PRES: prior subject start/end marks, (len, 2)
pres_in = Input(shape=(None, 2))
# PREO: prior object marks, (len, num_classes * 2)
preo_in = Input(shape=(None, num_classes * 2))

t1, t2, s1, s2, k1, k2, o1, o2, pres, preo = t1_in, t2_in, s1_in, s2_in, k1_in, k2_in, o1_in, o2_in, pres_in, preo_in
# build the padding mask from t1 (the character id sequence):
# K.greater gives 1 where the id is greater than 0 and 0 at padding; mask -> (batch, len, 1)
mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(t1)


def position_id(x):
    # position ids; when called as [t, k] it returns positions relative to a subject boundary; t = (batch, len, char_size), k1 = k2 = (batch, 1)
    if isinstance(x, list) and len(x) == 2:
        x, r = x
    else:
        r = 0
    # pid = [0, 1, ..., len - 1]
    pid = K.arange(K.shape(x)[1])
    # (1,len)
    pid = K.expand_dims(pid, 0)
    # tile over the batch: pid = (batch, len)
    pid = K.tile(pid, [K.shape(x)[0], 1])
    # distance of every position from r (r = 0 when only x is passed), shape (batch, len)
    return K.abs(pid - K.cast(r, 'int32'))
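
# Illustration only: a numpy sketch of position_id (toy length). Without a reference position it
# returns 0..len-1; with a reference r (e.g. the subject start k1) it returns |position - r|.
def _demo_position_id(length=6, r=3):
    pid = np.arange(length)
    return pid, np.abs(pid - r)  # ([0 1 2 3 4 5], [3 2 1 0 1 2])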


# t1 has shape (batch, len); pid = (batch, len)
pid = Lambda(position_id)(t1)
# position-embedding layer, initialized to zeros
position_embedding = Embedding(maxlen, char_size, embeddings_initializer='zeros')
# position embeddings, pv = (batch, len, char_size)
pv = position_embedding(pid)
# character-embedding layer, t1 = (batch, len, char_size)
t1 = Embedding(len(char2id) + 2, char_size)(t1)  # 0: padding, 1: unk
# t2=(batch,len,char_size)
t2 = Dense(char_size, use_bias=False)(t2)  # project the word vectors to the same dimension
t = Add()([t1, t2, pv])  # sum of character, word and position embeddings
t = Dropout(0.25)(t)
# mask = (batch, len, 1); zero out the padded positions
t = Lambda(lambda x: x[0] * x[1])([t, mask])
# each gated conv also applies the mask internally; t stays (batch, len, char_size), so the shared encoder keeps dimension char_size throughout
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 1)
############ end of the shared encoder; its output has shape (batch, len, char_size)
# t_dim=char_size
t_dim = K.int_shape(t)[-1]
############ global (sentence-level) gating signals, computed from the shared encoder output
pn1 = Dense(char_size, activation='relu')(t)
# pn1->(batch,len,1)
pn1 = Dense(1, activation='sigmoid')(pn1)
pn2 = Dense(char_size, activation='relu')(t)
# pn2->(batch,len,1)
pn2 = Dense(1, activation='sigmoid')(pn2)
############ end of the global signals
############ subject extraction
# self-attention over the encoder output
h = Attention(8, 16)([t, t, t, mask])
# concatenate prior features: t = (batch, len, char_size), pres = (batch, len, 2), h = (batch, len, 128)
h = Concatenate()([t, h, pres])
# (batch,len,char_size)
h = Conv1D(char_size, 3, activation='relu', padding='same')(h)
# subject start/end pointer sequences, (batch, len, 1)
ps1 = Dense(1, activation='sigmoid')(h)
ps2 = Dense(1, activation='sigmoid')(h)
# modulate with the global signals, ps1 = ps2 = (batch, len, 1)
ps1 = Lambda(lambda x: x[0] * x[1])([ps1, pn1])
ps2 = Lambda(lambda x: x[0] * x[1])([ps2, pn2])
# subject model: inputs are the character ids t1_in, the word vectors t2_in and the prior marks
# pres_in; outputs are the subject start/end pointer sequences ps1/ps2 -> (batch, len, 1)
subject_model = Model([t1_in, t2_in, pres_in], [ps1, ps2])  # model that predicts subjects
############ pc: a global relation-detection signal
# t_max = (batch, 1, char_size): masked max-pooling over the encoder output
t_max = Lambda(seq_maxpool)([t, mask])
pc = Dense(char_size, activation='relu')(t_max)
# pc = (batch, 1, num_classes): which relations are likely to appear anywhere in the sentence
pc = Dense(num_classes, activation='sigmoid')(pc)


# sample n = 6 encoder vectors, evenly spaced between the subject start and end positions
def get_k_inter(x, n=6):
    # seq->(batch,len,char_size),k1,k2->(batch,1)
    seq, k1, k2 = x
    # K.round rounds to the nearest integer; a runs over [0, 0.2, ..., 1.0], interpolating between k2 and k1
    k_inter = [K.round(k1 * a + k2 * (1 - a)) for a in np.arange(n) / (n - 1.)]
    # k->(batch,1),k_inter=[(batch,char_size),...]
    k_inter = [seq_gather([seq, k]) for k in k_inter]
    # k_inter=[(batch,1,char_size),...]
    k_inter = [K.expand_dims(k, 1) for k in k_inter]
    # k_inter=(batch,6,char_size)
    k_inter = K.concatenate(k_inter, 1)
    return k_inter
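
# Illustration only: a numpy sketch of the interpolation in get_k_inter (toy positions). For a
# subject spanning positions 4..9, the six sampled positions run from the end back to the start.
def _demo_k_inter(k1=4.0, k2=9.0, n=6):
    return [int(round(k1 * a + k2 * (1 - a))) for a in np.arange(n) / (n - 1.)]
    # -> [9, 8, 7, 6, 5, 4]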


# k1 = k2 = (batch, 1)
# t is the shared encoder output; k is the subject representation, read from 6 evenly spaced
# positions inside the subject span, with shape (batch, 6, char_size)
k = Lambda(get_k_inter, output_shape=(6, t_dim))([t, k1, k2])
# encode the 6 sampled vectors with a bidirectional GRU; only the final states are kept -> (batch, char_size * 2)
k = Bidirectional(GRU(t_dim))(k)
# t=(batch,len,char_size);k1=k2=(batch,1);k1v=k2v=(batch,len,char_size);Lambda(position_id)([t, k1])->(batch,len)
k1v = position_embedding(Lambda(position_id)([t, k1]))
k2v = position_embedding(Lambda(position_id)([t, k2]))
# kv=(batch,len,2*char_size)
kv = Concatenate()([k1v, k2v])
# k=(batch,1,char_size*2)->(batch,len,char_size*2)
k = Lambda(lambda x: K.expand_dims(x[0], 1) + x[1])([k, kv])
# second self-attention block, h = (batch, len, 128)
h = Attention(8, 16)([t, t, t, mask])
# concatenate all features: preo -> (batch, len, num_classes * 2), pres -> (batch, len, 2)
h = Concatenate()([t, h, k, pres, preo])
# h->(batch,len,char_size)
h = Conv1D(char_size, 3, activation='relu', padding='same')(h)
# a gating signal for objects, po -> (batch, len, 1)
po = Dense(1, activation='sigmoid')(h)
# object start pointers per predicate, po1 -> (batch, len, num_classes)
po1 = Dense(num_classes, activation='sigmoid')(h)
# object end pointers per predicate, po2 -> (batch, len, num_classes)
po2 = Dense(num_classes, activation='sigmoid')(h)
# combine with the global signals; po = (batch, len, 1), po1 = po2 = (batch, len, num_classes), pc = (batch, 1, num_classes), pn1 = pn2 = (batch, len, 1)
po1 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po1, pc, pn1])
po2 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po2, pc, pn2])
# object model: given the text and a subject span, predict the objects and their relations
object_model = Model([t1_in, t2_in, k1_in, k2_in, pres_in, preo_in], [po1, po2])
# the full training model
train_model = Model([t1_in, t2_in, s1_in, s2_in, k1_in, k2_in, o1_in, o2_in, pres_in, preo_in],
                    [ps1, ps2, po1, po2])

# s1=(batch,len)->(batch,len,1)
s1 = K.expand_dims(s1, 2)
s2 = K.expand_dims(s2, 2)
# subject losses: masked binary cross-entropy, s1_loss = (batch, len, 1) before reduction
s1_loss = K.binary_crossentropy(s1, ps1)
s1_loss = K.sum(s1_loss * mask) / K.sum(mask)
s2_loss = K.binary_crossentropy(s2, ps2)
s2_loss = K.sum(s2_loss * mask) / K.sum(mask)
# object losses: sum the cross-entropy over the predicate axis, then apply the same mask
o1_loss = K.sum(K.binary_crossentropy(o1, po1), 2, keepdims=True)
o1_loss = K.sum(o1_loss * mask) / K.sum(mask)
o2_loss = K.sum(K.binary_crossentropy(o2, po2), 2, keepdims=True)
o2_loss = K.sum(o2_loss * mask) / K.sum(mask)
# total loss
loss = (s1_loss + s2_loss) + (o1_loss + o2_loss)
# attach the loss to the model
train_model.add_loss(loss)
# compile the model (no loss argument is needed because the loss was added via add_loss)
train_model.compile(optimizer=Adam(1e-3))
train_model.summary()
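
# Illustration only: a numpy sketch of the masked binary cross-entropy used for the losses above
# (toy values). Multiplying by the mask and dividing by its sum averages only over real tokens,
# so padded positions contribute nothing to the loss.
def _demo_masked_bce():
    y_true = np.array([[1.0, 0.0, 0.0]])  # (batch=1, len=3)
    y_pred = np.array([[0.9, 0.2, 0.5]])  # the prediction at the padded position is irrelevant
    m = np.array([[1.0, 1.0, 0.0]])       # last position is padding
    bce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return (bce * m).sum() / m.sum()      # ~ 0.164; the padded position is ignored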


class ExponentialMovingAverage:
    """对模型权重进行指数滑动平均。
    用法:在model.compile之后、第一次训练之前使用;
    先初始化对象,然后执行inject方法。
    """

    # K.set_value(x, value): x must be a variable (e.g. created by tf.Variable or K.zeros); value is a numpy array
    def __init__(self, model, momentum=0.9999):
        self.momentum = momentum
        self.model = model
        # model.weights is a list of variables; K.batch_get_value turns such a list into a list of
        # numpy arrays (K.get_value only handles a single tensor/variable, hence batch_get_value).
        # self.ema_weights holds one zero-initialized shadow variable per model weight.
        self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights]

    def inject(self):
        """添加更新算子到model.metrics_updates。
        """
        self.initialize()
        for w1, w2 in zip(self.ema_weights, self.model.weights):
            op = K.moving_average_update(w1, w2, self.momentum)
            self.model.add_metric(op)

    def initialize(self):
        """ema_weights初始化跟原模型初始化一致。
        """
        # get the current weights as a list of numpy arrays
        self.old_weights = K.batch_get_value(self.model.weights)
        # copy them into the shadow variables
        K.batch_set_value(zip(self.ema_weights, self.old_weights))

    def apply_ema_weights(self):
        """备份原模型权重,然后将平均权重应用到模型上去。
        """
        self.old_weights = K.batch_get_value(self.model.weights)
        ema_weights = K.batch_get_value(self.ema_weights)
        K.batch_set_value(zip(self.model.weights, ema_weights))

    def reset_old_weights(self):
        """恢复模型到旧权重。
        """
        K.batch_set_value(zip(self.model.weights, self.old_weights))


EMAer = ExponentialMovingAverage(train_model)
EMAer.inject()
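
# Illustration only: a numpy-free sketch of the update performed by K.moving_average_update inside
# inject(): ema = momentum * ema + (1 - momentum) * weight. With a momentum close to 1 the shadow
# weights change slowly, which is what makes the averaged weights more stable at evaluation time.
def _demo_ema(momentum=0.9):
    ema, track = 0.0, []
    for w in [1.0, 1.0, 1.0, 1.0]:                 # pretend the model weight stays at 1.0
        ema = momentum * ema + (1 - momentum) * w  # the shadow weight drifts towards 1.0
        track.append(round(ema, 3))
    return track  # [0.1, 0.19, 0.271, 0.344]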


# extract triples from a single sentence at inference time (used for the dev and test sets)
def extract_items(text_in):
    # word segmentation
    text_words = tokenize(text_in.lower())
    text_in = ''.join(text_words)
    pre_items = {}
    # candidate triples from the AC automata, [(s, p, o), ...]
    for sp in spoer.extract_items(text_in):
        # locate subject and object in the sentence; find() returns -1 if absent
        subjectid = text_in.find(sp[0])
        objectid = text_in.find(sp[2])
        if subjectid != -1 and objectid != -1:
            key = (subjectid, subjectid + len(sp[0]))
            if key not in pre_items:
                pre_items[key] = []
            # {(s,s+len):[(o,o+len,pid),...],...}
            pre_items[key].append((objectid,
                                   objectid + len(sp[2]),
                                   predicate2id[sp[1]]))
    # _pres=(len,2)
    _pres = np.zeros((len(text_in), 2))
    # mark the candidate subject starts and ends
    for j in pre_items:
        _pres[j[0], 0] = 1
        _pres[j[1] - 1, 1] = 1
    # _pres=(1,len,2)
    _pres = np.expand_dims(_pres, 0)
    #
    R = []
    # _t1: character id sequence of the sentence
    _t1 = [char2id.get(c, 1) for c in text_in]
    # as a numpy array of shape (1, len)
    _t1 = np.array([_t1])
    # word vectors for the sentence, shape (1, len, word_size)
    _t2 = sent2vec([text_words])
    # predicted subject start/end scores, _k1 = _k2 = (1, len, 1) since only one sample is passed
    _k1, _k2 = subject_model.predict([_t1, _t2, _pres])
    # drop the batch and channel axes, _k1 = _k2 = (len,)
    _k1, _k2 = _k1[0, :, 0], _k2[0, :, 0]
    # np.where returns the indices where the condition holds; the start threshold is 0.5 and the
    # end threshold is slightly lower at 0.4
    _k1, _k2 = np.where(_k1 > 0.5)[0], np.where(_k2 > 0.4)[0]
    # collected subjects and their prior object marks
    _subjects, _PREO = [], []
    # for every predicted start position
    for i in _k1:
        # candidate end positions must not precede the start position
        j = _k2[_k2 >= i]
        # a subject exists if at least one end position remains
        if len(j) > 0:
            # take the nearest end position
            j = j[0]
            # slice the subject out of the sentence
            _subject = text_in[i: j + 1]
            # store (subject, start, end)
            _subjects.append((_subject, i, j))
            # _preo=(len,num,2)
            _preo = np.zeros((len(text_in), num_classes, 2))
            # fill in the prior object marks from the automaton candidates for this subject span
            for _ in pre_items.get((i, j + 1), []):  # [(oid,oid+len(o),pid),...]
                _preo[_[0], _[2], 0] = 1
                _preo[_[1] - 1, _[2], 1] = 1
            # _preo=(len,num_class*2)
            _preo = _preo.reshape((len(text_in), -1))
            # _PREO=[[len,num_class*2],...]
            _PREO.append(_preo)
    # if at least one subject was found
    if _subjects:
        # repeat the prior subject marks once per subject: (1, len, 2) -> (n_subjects, len, 2)
        _PRES = np.repeat(_pres, len(_subjects), 0)
        # (n,len,num_class*2)
        _PREO = np.array(_PREO)
        # _t1 -> (n_subjects, len)
        _t1 = np.repeat(_t1, len(_subjects), 0)
        # _t2 -> (n_subjects, len, word_size)
        _t2 = np.repeat(_t2, len(_subjects), 0)
        # subject spans [[i, j], ...] -> (n, 2) -> (2, n) -> (2, n, 1); _k1 holds the starts and _k2 the ends, each of shape (n, 1)
        _k1, _k2 = np.array([_s[1:] for _s in _subjects]).T.reshape((2, -1, 1))
        # predict the object positions; _o1 = _o2 = (n_subjects, len, num_classes)
        _o1, _o2 = object_model.predict([_t1, _t2, _k1, _k2, _PRES, _PREO])
        # _subjects = [(subject, i, j), ...]
        for i, _subject in enumerate(_subjects):
            _oo1, _oo2 = np.where(_o1[i] > 0.5), np.where(_o2[i] > 0.4)
            for _ooo1, _c1 in zip(*_oo1):
                for _ooo2, _c2 in zip(*_oo2):
                    if _ooo1 <= _ooo2 and _c1 == _c2:
                        _object = text_in[_ooo1: _ooo2 + 1]
                        _predicate = id2predicate[_c1]
                        R.append((_subject[0], _predicate, _object))
                        break
        zhuanji, gequ = [], []  # albums and songs, for the same filtering as in repair()
        # post-processing: add the symmetric spouse relation and filter album/song confusions
        for s, p, o in R[:]:
            if p == u'妻子':
                R.append((o, u'丈夫', s))
            elif p == u'丈夫':
                R.append((o, u'妻子', s))
            if p == u'所属专辑':
                zhuanji.append(o)
                gequ.append(s)
        spo_list = set()
        for s, p, o in R:
            if p in [u'歌手', u'作词', u'作曲']:
                if s in zhuanji and s not in gequ:
                    continue
            spo_list.add((s, p, o))
        return list(spo_list)
    else:
        return []


class Evaluate(Callback):
    def __init__(self):
        super(Evaluate, self).__init__()
        self.F1 = []
        self.best = 0.
        self.passed = 0
        self.stage = 0

    def on_batch_begin(self, batch, logs=None):
        """第一个epoch用来warmup,不warmup有不收敛的可能。
        """
        # self.params['steps'] is the number of steps per epoch
        if self.passed < self.params['steps']:
            lr = (self.passed + 1.) / self.params['steps'] * 1e-3
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1

    def on_epoch_end(self, epoch, logs=None):
        # at the end of each epoch, swap in the EMA weights for evaluation
        EMAer.apply_ema_weights()
        #
        f1, precision, recall = self.evaluate()
        self.F1.append(f1)
        if f1 > self.best:
            self.best = f1
            train_model.save_weights('best_model.weights')
        print('f1: %.4f, precision: %.4f, recall: %.4f, best f1: %.4f\n' % (f1, precision, recall, self.best))
        # restore the raw (non-averaged) weights before training continues
        EMAer.reset_old_weights()
        if epoch + 1 == 50 or (
                self.stage == 0 and epoch > 10 and
                (f1 < 0.5 or np.argmax(self.F1) < len(self.F1) - 8)
        ):
            self.stage = 1
            train_model.load_weights('best_model.weights')
            EMAer.initialize()
            K.set_value(self.model.optimizer.lr, 1e-4)
            K.set_value(self.model.optimizer.iterations, 0)
            opt_weights = K.batch_get_value(self.model.optimizer.weights)
            opt_weights = [w * 0. for w in opt_weights]
            K.batch_set_value(zip(self.model.optimizer.weights, opt_weights))

    def evaluate(self):
        orders = ['subject', 'predicate', 'object']
        #
        A, B, C = 1e-10, 1e-10, 1e-10
        F = open('dev_pred.json', 'w', encoding='utf-8')
        for d in tqdm(iter(dev_data)):
            # R: predicted triples, T: gold triples
            R = set(extract_items(d['text']))
            T = set(d['spo_list'])
            A += len(R & T)
            B += len(R)
            C += len(T)
            s = json.dumps({
                'text': d['text'],
                'spo_list': [
                    dict(zip(orders, spo)) for spo in T
                ],
                'spo_list_pred': [
                    dict(zip(orders, spo)) for spo in R
                ],
                'new': [
                    dict(zip(orders, spo)) for spo in R - T
                ],
                'lack': [
                    dict(zip(orders, spo)) for spo in T - R
                ]
            }, ensure_ascii=False, indent=4)
            F.write(s + '\n')
        F.close()
        # return f1, precision, recall
        return 2 * A / (B + C), A / B, A / C


def test(test_data):
    """输出测试结果
    """
    orders = ['subject', 'predicate', 'object', 'object_type', 'subject_type']
    F = open('test_pred.json', 'w', encoding='utf-8')
    for d in tqdm(iter(test_data)):
        R = set(extract_items(d['text']))
        s = json.dumps({
            'text': d['text'],
            'spo_list': [
                dict(zip(orders, spo + ('', ''))) for spo in R
            ]
        }, ensure_ascii=False)
        F.write(s + '\n')
    F.close()


train_D = data_generator(train_data)
evaluator = Evaluate()

if __name__ == '__main__':
    train_model.fit_generator(train_D.__iter__(),
                              steps_per_epoch=len(train_D),
                              epochs=120,
                              callbacks=[evaluator]
                              )
else:
    train_model.load_weights('best_model.weights')

 
