1. Overview
The code below is the DGCNN model that Su Jianlin (苏神) proposed for the Baidu information-extraction competition. The original source is almost entirely uncommented, so this post annotates most of the code; some annotations may be wrong, and corrections are welcome. In addition, the computation in the Attention module has been adapted to the keras 3.x API.
2. Annotated Code
#! -*- coding:utf-8 -*-
from __future__ import print_function
import json
import os
import re
from random import choice
# Aho-Corasick keyword matching
import ahocorasick
import numpy as np
# Chinese word segmentation
import pyhanlp
from gensim.models import Word2Vec
from tqdm import tqdm
import tensorflow as tf
mode = 0
char_size = 128
maxlen = 512
word2vec = Word2Vec.load('data/word2vec_baike')
# Build the vocabulary lookup tables
id2word = {i + 1: j for i, j in enumerate(word2vec.wv.index2word)}
word2id = {j: i for i, j in id2word.items()}
word2vec = word2vec.wv.vectors
word_size = word2vec.shape[1]
# Prepend an all-zero row so that id 0 can act as padding
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])
def tokenize(s):
    # Segment a sentence into words
    return [i.word for i in pyhanlp.HanLP.segment(s)]
# Acts as the word-embedding layer
def sent2vec(S):
    """S format: [[w1, w2, ...], ...]
    """
    V = []
    for s in S:
        V.append([])
        for w in s:
            for _ in w:
                # w is a whole word, not a single character; to stay aligned with
                # the char sequence, its id is repeated once per character
                V[-1].append(word2id.get(w, 0))
    # Pad the id sequences
    V = seq_padding(V)
    # Look up word vectors for the id sequences [[wid1, wid2, ...], ...]
    # -> shape (n, len, word_size)
    V = word2vec[V]
    return V
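# --- Illustration (added note, not part of the original script) -----------------
# Assuming tokenize(u'周杰伦唱歌') == [u'周杰伦', u'唱歌'], sent2vec builds
#     [[id(u'周杰伦')] * 3 + [id(u'唱歌')] * 2]
# i.e. five characters -> five word ids, so the word-level features line up
# position by position with the char-level inputs T1.
# ---------------------------------------------------------------------------------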
# Load the data; each item has the form {'text': '', 'spo_list': [[s, p, o], ...]}
total_data = json.load(open('./data/train_data_me.json', encoding='utf-8'))
# id2predicate={'id': p, ...}  predicate2id={p: id, ...}
id2predicate, predicate2id = json.load(open('./data/all_50_schemas_me.json', encoding='utf-8'))
# Cast the keys to int: id2predicate={id: p, ...}
id2predicate = {int(i): j for i, j in id2predicate.items()}
# id2char={'id': char, ...}  char2id={char: id, ...}
id2char, char2id = json.load(open('./data/all_chars_me.json', encoding='utf-8'))
# Total number of relation types
num_classes = len(id2predicate)
# Persist one shuffled index list over the whole dataset, so the split is reproducible
if not os.path.exists('data/random_order_vote.json'):
random_order = [i for i in range(len(total_data))]
np.random.shuffle(random_order)
json.dump(
random_order,
open('data/random_order_vote.json', 'w', encoding='utf-8'),
indent=4
)
else:
random_order = json.load(open('data/random_order_vote.json', encoding='utf-8'))
# Split the data 7:1 into train and dev (every 8th sample, offset by mode, is held out)
train_data = [total_data[j] for i, j in enumerate(random_order) if i % 8 != mode]
dev_data = [total_data[j] for i, j in enumerate(random_order) if i % 8 == mode]
predicates = {}  # Format: {predicate: [(subject, predicate, object), ...]}
def repair(d):
    # Lowercase the text
    d['text'] = d['text'].lower()
    # Extract the titles enclosed in 《》 brackets
    something = re.findall(u'《([^《》]*?)》', d['text'])
    something = [s.strip() for s in something]
    zhuanji = []  # albums
    gequ = []     # songs
    for sp in d['spo_list']:
        # Strip 《》 brackets and whitespace from subject and object
        sp[0] = sp[0].strip(u'《》').strip().lower()
        sp[2] = sp[2].strip(u'《》').strip().lower()
        # If the subject is a unique substring of one of the extracted titles,
        # expand it to the full title
        for some in something:
            if sp[0] in some and d['text'].count(sp[0]) == 1:
                sp[0] = some
        if sp[1] == u'所属专辑':
            zhuanji.append(sp[2])
            gequ.append(sp[0])
    spo_list = []
    for sp in d['spo_list']:
        if sp[1] in [u'歌手', u'作词', u'作曲']:
            # Drop song-style triples whose subject is actually an album name
            if sp[0] in zhuanji and sp[0] not in gequ:
                continue
        spo_list.append(tuple(sp))
    d['spo_list'] = spo_list
# Build the predicate -> spo dictionary
for d in train_data:
    # Clean the sample
    repair(d)
    for sp in d['spo_list']:
        if sp[1] not in predicates:
            predicates[sp[1]] = []
        predicates[sp[1]].append(sp)  # Format: {predicate: [(subject, predicate, object), ...]}
for d in dev_data:
    repair(d)
# Data augmentation: replace a triple's subject/object with those of another
# triple that has the same predicate
def random_generate(d, spo_list_key):
    r = np.random.random()
    if r > 0.5:  # with probability ~0.5 leave the sample untouched
        return d
    else:
        # len(d[spo_list_key]) is the number of triples in this sentence;
        # pick one of them at random for replacement
        k1 = np.random.randint(len(d[spo_list_key]))
        # The randomly chosen triple ['s', 'p', 'o']
        spi = d[spo_list_key][k1]
        # Randomly pick another triple with the same predicate
        k2 = np.random.randint(len(predicates[spi[1]]))
        spo = predicates[spi[1]][k2]  # ['s', 'p', 'o']
        # Substitute the old subject/object with the new ones
        F = lambda s: s.replace(spi[0], spo[0]).replace(spi[2], spo[2])
        # Apply the substitution to the text
        text = F(d['text'])
        # And to every triple
        spo_list = [(F(sp[0]), sp[1], F(sp[2])) for sp in d[spo_list_key]]
        # Return the augmented sample
        return {'text': text, spo_list_key: spo_list}
# Pad a batch of sequences to the batch-wide max length
def seq_padding(X, padding=None):
    if padding is None:
        padding = 0
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])
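# A quick self-check of seq_padding (added for illustration; safe to delete):
# ragged integer lists are right-padded with 0 up to the longest one.
assert (seq_padding([[1, 2, 3], [4]]) == np.array([[1, 2, 3], [4, 0, 0]])).all()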
class AC_Unicode:
    """A thin wrapper that gives the AC automaton unicode support.
    """
    def __init__(self):
        self.ac = ahocorasick.Automaton()
    def add_word(self, k, v):
        # k = k.encode('utf-8')
        self.ac.add_word(k, v)
    def make_automaton(self):
        self.ac.make_automaton()
    def iter(self, s):
        # s = s.encode('utf-8')
        return self.ac.iter(s)
# Match triples inside a sentence using automata built from the training set
class spo_searcher:
    def __init__(self, train_data):
        # Automaton over subject keywords
        self.s_ac = AC_Unicode()
        # Automaton over object keywords
        self.o_ac = AC_Unicode()
        self.so2p = {}
        self.spo_total = {}
        # Collect the triples of every training sample
        for i, d in enumerate(train_data):
            # train_data has the form [{'text': '', 'spo_list': [['s', 'p', 'o']]}]
            for s, p, o in d['spo_list']:
                # Register the keywords with the automata
                self.s_ac.add_word(s, s)
                self.o_ac.add_word(o, o)
                if (s, o) not in self.so2p:
                    self.so2p[(s, o)] = set()
                if (s, p, o) not in self.spo_total:
                    self.spo_total[(s, p, o)] = set()
                # so2p={(s, o): {p}}: all predicates seen for the same (s, o) pair
                self.so2p[(s, o)].add(p)
                # spo_total={(s, p, o): {i}}: ids of the samples this triple occurs in
                self.spo_total[(s, p, o)].add(i)
        # Finalize the automata
        self.s_ac.make_automaton()
        self.o_ac.make_automaton()
    # Extract candidate triples from a sentence with the automata, whose keyword
    # dictionaries contain every subject and object of the training set
    def extract_items(self, text_in, text_idx=None):
        R = set()
        # Scan the sentence with both automata
        # Candidate subjects
        for s in self.s_ac.iter(text_in):
            # Candidate objects
            for o in self.o_ac.iter(text_in):
                # Keep every (s, o) pair in the sentence that also occurs in training
                if (s[1], o[1]) in self.so2p:
                    # Emit one (s, p, o) triple per known predicate
                    for p in self.so2p[(s[1], o[1])]:
                        # text_idx is None at prediction time: keep every candidate
                        if text_idx is None:
                            # R -> {('s', 'p', 'o')}
                            R.add((s[1], p, o[1]))
                        # During training, keep the triple only if it also occurs in
                        # some other sample (avoid leaking the current one)
                        elif self.spo_total[(s[1], p, o[1])] - {text_idx}:
                            R.add((s[1], p, o[1]))
        # Return the candidate triples found by keyword matching
        # R={(s, p, o)} -> [(s, p, o)]
        return list(R)
# Instantiate
spoer = spo_searcher(train_data)
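# --- Note on the automaton API (added; a fact about pyahocorasick, not this repo) ---
# Automaton.iter(haystack) yields (end_index, value) tuples, which is why the code
# above reads the matched string as s[1] / o[1]. For example, after
# add_word(u'周杰伦', u'周杰伦'), iterating over u'歌手周杰伦' would yield
# (4, u'周杰伦'): index 4 is the position of the last matched character.
# --------------------------------------------------------------------------------------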
class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            # Shuffle the sample indices
            idxs = [i for i in range(len(self.data))]
            np.random.shuffle(idxs)
            # Buffers that accumulate one batch of model inputs
            T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO = [], [], [], [], [], [], [], [], [], []
            for i in idxs:
                spo_list_key = 'spo_list'  # if np.random.random() > 0.5 else 'spo_list_with_pred'
                # Augment the sample by random entity replacement
                d = random_generate(self.data[i], spo_list_key)
                # Truncate the text to the max length
                text = d['text'][:maxlen]
                # Word segmentation
                text_words = tokenize(text)
                # Re-join the words so text matches the segmented string exactly
                text = ''.join(text_words)
                # items stores the span of every subject together with its objects
                items = {}
                # d[spo_list_key] -> [[s, p, o], ...]
                for sp in d[spo_list_key]:
                    # Locate subject and object in the sentence
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    # find() returns -1 when the string is absent
                    if subjectid != -1 and objectid != -1:
                        # key = (subject start, subject start + subject length)
                        key = (subjectid, subjectid + len(sp[0]))
                        if key not in items.keys():
                            items[key] = []
                        # {(sid, sid+slen): [(oid, oid+olen, pid), ...]}
                        items[key].append((objectid,
                                           objectid + len(sp[2]),
                                           predicate2id[sp[1]]))  # predicate2id={p: id, ...}
                # A second pass with the automaton matcher
                pre_items = {}
                # Triples found in this text by the automata, as [('s', 'p', 'o'), ...]
                for sp in spoer.extract_items(text, i):
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    if subjectid != -1 and objectid != -1:
                        key = (subjectid, subjectid + len(sp[0]))
                        if key not in pre_items:
                            pre_items[key] = []
                        # {(sid, sid+slen): [(oid, oid+olen, pid)]}
                        pre_items[key].append((objectid,
                                               objectid + len(sp[2]),
                                               predicate2id[sp[1]]))
                # Only keep sentences that contain at least one triple
                if items:
                    # Map the (augmented, truncated) text to char ids
                    # T1 -> [[...], ...]
                    T1.append([char2id.get(c, 1) for c in text])  # 1 is unk, 0 is padding
                    # The segmented sentence, T2 -> [[...], ...]
                    T2.append(text_words)
                    # s1, s2 = [0, 0, ...]
                    s1, s2 = np.zeros(len(text)), np.zeros(len(text))
                    # items={(sid, sid+slen): [(oid, oid+olen, pid), ...]}
                    # Mark the subject spans
                    for j in items:
                        # j[0] is the subject start position
                        # s1 marks subject starts
                        s1[j[0]] = 1
                        # s2 marks subject ends
                        s2[j[1] - 1] = 1
                    # pres=[[0, 0], ...]: column 0 marks subject starts found by the
                    # automata, column 1 marks subject ends (a prior feature)
                    pres = np.zeros((len(text), 2))
                    # pre_items has the same structure as items
                    for j in pre_items:
                        pres[j[0], 0] = 1
                        pres[j[1] - 1, 1] = 1
                    # Collect every subject span of this sentence
                    a = []
                    for e in items.keys():
                        a.append(e)
                    # a -> [(sid, sid+slen), ...]
                    # k1=[start1, ...], k2=[end1, ...]
                    k1, k2 = np.array(a).T
                    # Sample one subject start at random
                    k1 = choice(k1)
                    # Sample an end that is not before the chosen start; with a single
                    # triple this is deterministic.
                    # k2 >= k1 gives a boolean mask [False, ..., True, ...] that keeps
                    # only end positions at or after the sampled start
                    k2 = choice(k2[k2 >= k1])
                    # num_classes is the number of predicates; o1 -> (len, num): marking
                    # the object position in a fixed column encodes the relation as well
                    o1, o2 = np.zeros((len(text), num_classes)), np.zeros((len(text), num_classes))
                    # items.get((k1, k2), []) is empty when the sampled span has no
                    # recorded objects, in which case this loop simply does nothing;
                    # that is not a bug
                    for j in items.get((k1, k2), []):
                        # j -> (ostart, oend, pid)
                        o1[j[0], j[2]] = 1
                        o2[j[1] - 1, j[2]] = 1
                    # preo -> (len, num, 2): the automaton prior for objects, marking
                    # object start/end per predicate class
                    preo = np.zeros((len(text), num_classes, 2))
                    for j in pre_items.get((k1, k2), []):
                        preo[j[0], j[2], 0] = 1
                        preo[j[1] - 1, j[2], 1] = 1
                    # preo -> (len, num*2)
                    preo = preo.reshape((len(text), -1))
                    # S1: 0/1 tags of subject starts, S1 -> [[...], ...]
                    S1.append(s1)
                    # S2: 0/1 tags of subject ends, S2 -> [[...], ...]
                    S2.append(s2)
                    # K1: the sampled subject start, K1 -> [[k1], [k1], ...]
                    K1.append([k1])
                    # K2: the sampled subject end (inclusive index), K2 -> [[k2-1], ...]
                    K2.append([k2 - 1])
                    # [[len(text), num_classes]]
                    # O1: object starts, one column per predicate
                    O1.append(o1)
                    # O2: object ends, one column per predicate
                    O2.append(o2)
                    # pres=[[0, 0], ...]: the subject prior
                    PRES.append(pres)
                    # preo -> (len, num*2): the object prior
                    PREO.append(preo)
                    # Pad everything and emit one batch
                    if len(T1) == self.batch_size or i == idxs[-1]:
                        # T1: char id sequences
                        T1 = seq_padding(T1)
                        # T2: word sequences -> word vectors
                        T2 = sent2vec(T2)
                        # S1: subject-start tags
                        S1 = seq_padding(S1)
                        # S2: subject-end tags
                        S2 = seq_padding(S2)
                        O1 = seq_padding(O1, np.zeros(num_classes))
                        O2 = seq_padding(O2, np.zeros(num_classes))
                        K1, K2 = np.array(K1), np.array(K2)
                        PRES = seq_padding(PRES, np.zeros(2))
                        PREO = seq_padding(PREO, np.zeros(num_classes * 2))
                        yield [T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO], None
                        T1, T2, S1, S2, K1, K2, O1, O2, PRES, PREO = [], [], [], [], [], [], [], [], [], []
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam
# Gather, per batch item, the encoder vector at the sampled subject position
def seq_gather(x):
    """seq has shape [None, seq_len, s_size] and
    idxs has shape [None, 1]; pick the idxs[i]-th vector from the i-th sequence,
    yielding a [None, s_size] output.
    """
    seq, idxs = x
    idxs = K.cast(idxs, 'int32')
    # K.shape(seq)[0] is the dynamic batch size (int_shape would give None here)
    batch_idxs = K.arange(0, K.shape(seq)[0])  # (batch,)
    # (batch, 1)
    batch_idxs = K.expand_dims(batch_idxs, 1)
    # (batch, 2)
    idxs = K.concatenate([batch_idxs, idxs], 1)
    # Slice seq with idxs: each row [b, p] selects position p of sentence b,
    # i.e. the subject-start vector; return shape (batch, char_size)
    return tf.gather_nd(seq, idxs)
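# --- Worked example (added): the gather_nd index tensor --------------------------
# With batch size 2 and idxs = [[1], [3]], the code builds
#     idxs = [[0, 1], [1, 3]]
# so gather_nd returns [seq[0, 1], seq[1, 3]], one vector per sentence.
# ----------------------------------------------------------------------------------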
def seq_maxpool(x):
    """seq has shape [None, seq_len, s_size] and
    mask has shape [None, seq_len, 1]; suppress the masked positions first,
    then max-pool over time.
    """
    seq, mask = x
    # Push padded positions down to ~-1e10 so max() never selects them
    seq -= (1 - mask) * 1e10
    # Max over the time axis; return shape (batch, 1, char_size)
    return K.max(seq, 1, keepdims=True)
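# --- Note (added): the -1e10 trick in one example ---------------------------------
# For one sentence with positions [[2., 9.], [5., 7.]] and mask [[1.], [0.]], the
# second (padded) position becomes ~[-1e10, -1e10], so the pooled result is
# [2., 9.]: the maximum is taken over real tokens only.
# ------------------------------------------------------------------------------------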
def dilated_gated_conv1d(seq, mask, dilation_rate=1):
    """Dilated gated convolution (residual form).
    """
    # seq=(batch, len, char_size), mask=(batch, len, 1)
    dim = K.int_shape(seq)[-1]
    # h=(batch, len, char_size*2): one conv produces both the gate and the update
    h = Conv1D(dim * 2, 3, padding='same', dilation_rate=dilation_rate)(seq)
    def _gate(x):
        dropout_rate = 0.1
        s, h = x
        # g=h=(batch, len, char_size)
        g, h = h[:, :, :dim], h[:, :, dim:]
        # K.in_train_phase(x, y) returns x while training and y otherwise:
        # dropout noise is injected on the gate only during training
        g = K.in_train_phase(K.dropout(g, dropout_rate), g)
        # Sigmoid gate
        g = K.sigmoid(g)
        # Gated residual: interpolate between the input s and the conv output h
        return g * s + (1 - g) * h
    # seq=(batch, len, char_size)
    seq = Lambda(_gate)([seq, h])
    seq = Lambda(lambda x: x[0] * x[1])([seq, mask])
    return seq
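# --- The gate in equation form (added) ----------------------------------------------
# Splitting the 2*dim conv output into (G, H), the block computes
#     sigma(G) * X + (1 - sigma(G)) * H
# so when sigma(G) -> 1 the layer degenerates to the identity mapping on X,
# which is what makes this residual path easy to learn.
# --------------------------------------------------------------------------------------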
class Attention(Layer):
    """Multi-head attention.
    """
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)
    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        # Three inputs: q, k, v
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[0][-1]
        v_in_dim = input_shape[0][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='v_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')
    def mask(self, x, mask, mode='mul'):
        # mask=(batch, len, 1)
        if mask is None:
            return x
        else:
            # K.ndim() returns the rank: x is 4-D, mask is 3-D,
            # so expand mask to (batch, len, 1, 1)
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10
    def call(self, inputs, **kwargs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
        if len(inputs) > 4:
            q_mask = inputs[4]
        # Linear maps: qw=kw=vw=(batch, len, out_dim=head_num*head_dim)
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        self.xlen = K.shape(qw)[1]
        # Split heads: qw=kw=vw=(batch, len, head_num, head_dim)
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # Transpose: qw=kw=vw=(batch, head_num, len, head_dim)
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # Keras-version workaround: batch_dot on 4-D tensors behaves differently
        # across versions, so fold the heads into the batch axis,
        # reshape to (batch*head_num, len, head_dim)
        qw = K.reshape(qw, (-1, K.shape(qw)[-2], K.shape(qw)[-1]))
        kw = K.reshape(kw, (-1, K.shape(kw)[-2], K.shape(kw)[-1]))
        vw = K.reshape(vw, (-1, K.shape(vw)[-2], K.shape(vw)[-1]))
        # Scaled dot-product attention: a=(batch*head_num, len, len)
        a = K.batch_dot(qw, kw, axes=[2, 2]) / self.size_per_head ** 0.5
        # Restore the head axis so the (batch, len, 1, 1) mask can broadcast:
        # a=(batch, head_num, len_q, len_k) -> (batch, len_k, len_q, head_num)
        a = K.reshape(a, (-1, self.nb_head, self.xlen, self.xlen))
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        # Back to a=(batch, head_num, len_q, len_k)
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        # Normalize the attention scores over the key axis
        a = K.softmax(a)
        # a=(batch*head_num, len, len) x vw=(batch*head_num, len, head_dim)
        # -> o=(batch*head_num, len, head_dim)
        a = K.reshape(a, [-1, self.xlen, self.xlen])
        vw = K.reshape(vw, [-1, self.xlen, self.size_per_head])
        o = K.batch_dot(a, vw, axes=[2, 1])
        # o = K.batch_dot(a, vw, [3, 2])
        # o=(batch, head_num, len, head_dim)
        o = K.reshape(o, [-1, self.nb_head, self.xlen, self.size_per_head])
        # o=(batch, len, head_num, head_dim)
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        # o=(batch, len, head_num*head_dim)
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
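# --- Shape walkthrough (added) -------------------------------------------------------
# Attention(8, 16)([t, t, t, mask]) with t of shape (batch, len, char_size):
# the q/k/v projections give (batch, len, 128), split into 8 heads of 16 dims;
# the scores (batch*8, len, len) are scaled by sqrt(16) = 4, masked with v_mask,
# softmax-ed over keys, applied to v, and merged back to (batch, len, 128).
# ---------------------------------------------------------------------------------------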
# T1: char id sequences
t1_in = Input(shape=(None,))
# T2: word-vector sequences
t2_in = Input(shape=(None, word_size))
# S1: 0/1 tags of subject starts
s1_in = Input(shape=(None,))
# S2: 0/1 tags of subject ends
s2_in = Input(shape=(None,))
# K1: the sampled subject start position
k1_in = Input(shape=(1,))
# K2: the sampled subject end position
k2_in = Input(shape=(1,))
# O1: object starts, one column per predicate
o1_in = Input(shape=(None, num_classes))
# O2: object ends, one column per predicate
o2_in = Input(shape=(None, num_classes))
# pres=[[0, 0], ...]: the automaton subject prior
pres_in = Input(shape=(None, 2))
# preo -> (len, num*2): the automaton object prior
preo_in = Input(shape=(None, num_classes * 2))
t1, t2, s1, s2, k1, k2, o1, o2, pres, preo = t1_in, t2_in, s1_in, s2_in, k1_in, k2_in, o1_in, o2_in, pres_in, preo_in
# Derive the mask from the char ids t1:
# K.greater(x, y) compares elementwise (1 where x > y, else 0); mask -> (batch, len, 1)
mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(t1)
def position_id(x):
    # Used for relative position encoding once a subject is sampled; called with
    # t alone or with [t, k]; t=(batch, len, char_size), k1=k2=(batch, 1)
    if isinstance(x, list) and len(x) == 2:
        x, r = x
    else:
        r = 0
    # pid=[0, 1, ..., len-1]
    pid = K.arange(K.shape(x)[1])
    # (1, len)
    pid = K.expand_dims(pid, 0)
    # Tile along the batch axis: pid=(batch, len)
    pid = K.tile(pid, [K.shape(x)[0], 1])
    # Distance of every position to the reference r (0 when called without k)
    return K.abs(pid - K.cast(r, 'int32'))
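# --- Example (added) -------------------------------------------------------------------
# For a 5-token sentence, position_id(t) gives [0, 1, 2, 3, 4], while
# position_id([t, k]) with k = 2 gives the relative distances [2, 1, 0, 1, 2],
# which index the shared position_embedding below.
# -----------------------------------------------------------------------------------------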
# t1 is a (batch, len) tensor, so pid=(batch, len)
pid = Lambda(position_id)(t1)
# Position-embedding layer, zero-initialized
position_embedding = Embedding(maxlen, char_size, embeddings_initializer='zeros')
# Position vectors: pv=(batch, len, char_size)
pv = position_embedding(pid)
# Char-embedding layer: t1=(batch, len, char_size)
t1 = Embedding(len(char2id) + 2, char_size)(t1)  # 0: padding, 1: unk
# Project the word vectors to the same dimension: t2=(batch, len, char_size)
t2 = Dense(char_size, use_bias=False)(t2)
t = Add()([t1, t2, pv])  # char + word + position vectors
t = Dropout(0.25)(t)
# mask=(batch, len, 1): zero out the padded positions
t = Lambda(lambda x: x[0] * x[1])([t, mask])
# t was masked already, but each conv layer re-applies the mask so padded positions
# stay zero throughout; t stays (batch, len, char_size), i.e. the shared encoder
# keeps the char_size dimension
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 2)
t = dilated_gated_conv1d(t, mask, 5)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 1)
t = dilated_gated_conv1d(t, mask, 1)
###################################################################### end of the shared encoder, output (batch, len, char_size)
# t_dim = char_size
t_dim = K.int_shape(t)[-1]
#################################################################### global information, produced from the shared encoder
pn1 = Dense(char_size, activation='relu')(t)
# pn1 -> (batch, len, 1)
pn1 = Dense(1, activation='sigmoid')(pn1)
pn2 = Dense(char_size, activation='relu')(t)
# pn2 -> (batch, len, 1)
pn2 = Dense(1, activation='sigmoid')(pn2)
#################################################################### end of global information
############################################################################### subject extraction
# Self-attention
h = Attention(8, 16)([t, t, t, mask])
# Concatenate the prior feature: t=(batch, len, char_size), pres=(batch, len, 2), h=(batch, len, 128)
h = Concatenate()([t, h, pres])
# (batch, len, char_size)
h = Conv1D(char_size, 3, activation='relu', padding='same')(h)
# Pointer tags for the subject spans: (batch, len, 1)
ps1 = Dense(1, activation='sigmoid')(h)
ps2 = Dense(1, activation='sigmoid')(h)
# Blend in the global signals: ps1=ps2=(batch, len, 1)
ps1 = Lambda(lambda x: x[0] * x[1])([ps1, pn1])
ps2 = Lambda(lambda x: x[0] * x[1])([ps2, pn2])
# Subject model. Inputs: char ids t1_in, word vectors t2_in, prior pres_in;
# outputs: the subject pointer sequences ps1 (starts) and ps2 (ends) -> (batch, len, 1)
subject_model = Model([t1_in, t2_in, pres_in], [ps1, ps2])  # predicts subjects
####################################################################### pc: global relation information
# t_max=(batch, 1, char_size): mask + max-pool the encoder output
t_max = Lambda(seq_maxpool)([t, mask])
pc = Dense(char_size, activation='relu')(t_max)
# pc=(batch, 1, num_classes): global relation detector
pc = Dense(num_classes, activation='sigmoid')(pc)
# Summarize the subject span with a fixed number (6) of sampled positions
def get_k_inter(x, n=6):
    # seq -> (batch, len, char_size), k1, k2 -> (batch, 1)
    seq, k1, k2 = x
    # K.round() rounds to the nearest integer; k_inter=[(batch, 1), ...]
    # with interpolation weights a = [0, 0.2, ..., 1]
    k_inter = [K.round(k1 * a + k2 * (1 - a)) for a in np.arange(n) / (n - 1.)]
    # Each k -> (batch, 1); k_inter=[(batch, char_size), ...]
    k_inter = [seq_gather([seq, k]) for k in k_inter]
    # k_inter=[(batch, 1, char_size), ...]
    k_inter = [K.expand_dims(k, 1) for k in k_inter]
    # k_inter=(batch, 6, char_size)
    k_inter = K.concatenate(k_inter, 1)
    return k_inter
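# --- Worked example (added) ------------------------------------------------------------
# With k1=3, k2=7 and n=6, the sampled positions are round(3a + 7(1-a)) for
# a in [0, 0.2, 0.4, 0.6, 0.8, 1.0], i.e. [7, 6, 5, 5, 4, 3]: six roughly evenly
# spaced positions covering the subject span, traversed from its end to its start.
# -----------------------------------------------------------------------------------------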
# k1=k2=(batch, 1)
# t is the shared encoder output; the result k is the subject representation,
# resampled to the fixed length 6 -> (batch, 6, char_size)
k = Lambda(get_k_inter, output_shape=(6, t_dim))([t, k1, k2])
# Encode the span with a BiGRU -> (batch, char_size*2); only the final state is kept
k = Bidirectional(GRU(t_dim))(k)
# t=(batch, len, char_size); k1=k2=(batch, 1); k1v=k2v=(batch, len, char_size);
# Lambda(position_id)([t, k1]) -> (batch, len)
k1v = position_embedding(Lambda(position_id)([t, k1]))
k2v = position_embedding(Lambda(position_id)([t, k2]))
# kv=(batch, len, 2*char_size)
kv = Concatenate()([k1v, k2v])
# k=(batch, 1, char_size*2) broadcast-added -> (batch, len, char_size*2)
k = Lambda(lambda x: K.expand_dims(x[0], 1) + x[1])([k, kv])
# Second attention block, h=(batch, len, 128)
h = Attention(8, 16)([t, t, t, mask])
# Concatenate all features; preo -> (batch, len, num*2), pres -> (batch, len, 2)
h = Concatenate()([t, h, k, pres, preo])
# h -> (batch, len, char_size)
h = Conv1D(char_size, 3, activation='relu', padding='same')(h)
# Global gate po -> (batch, len, 1)
po = Dense(1, activation='sigmoid')(h)
# Object start pointers po1 -> (batch, len, num)
po1 = Dense(num_classes, activation='sigmoid')(h)
# Object end pointers po2 -> (batch, len, num)
po2 = Dense(num_classes, activation='sigmoid')(h)
# Combine with the global signals; po=(batch, len, 1), po1=po2=(batch, len, num),
# pc=(batch, 1, num), pn1=pn2=(batch, len, 1)
po1 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po1, pc, pn1])
po2 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po2, pc, pn2])
# Object model: given the text and a subject, predict the objects and relations
object_model = Model([t1_in, t2_in, k1_in, k2_in, pres_in, preo_in], [po1, po2])
# Full training model
train_model = Model([t1_in, t2_in, s1_in, s2_in, k1_in, k2_in, o1_in, o2_in, pres_in, preo_in],
                    [ps1, ps2, po1, po2])
# s1=(batch, len) -> (batch, len, 1)
s1 = K.expand_dims(s1, 2)
s2 = K.expand_dims(s2, 2)
# Subject cross-entropy, masked and averaged over real tokens only
s1_loss = K.binary_crossentropy(s1, ps1)
s1_loss = K.sum(s1_loss * mask) / K.sum(mask)
s2_loss = K.binary_crossentropy(s2, ps2)
s2_loss = K.sum(s2_loss * mask) / K.sum(mask)
# Object cross-entropy, summed over the predicate axis, then masked the same way
o1_loss = K.sum(K.binary_crossentropy(o1, po1), 2, keepdims=True)
o1_loss = K.sum(o1_loss * mask) / K.sum(mask)
o2_loss = K.sum(K.binary_crossentropy(o2, po2), 2, keepdims=True)
o2_loss = K.sum(o2_loss * mask) / K.sum(mask)
# Total loss
loss = (s1_loss + s2_loss) + (o1_loss + o2_loss)
# Attach the loss to the model
train_model.add_loss(loss)
# Compile
train_model.compile(optimizer=Adam(1e-3))
train_model.summary()
class ExponentialMovingAverage:
    """Maintain an exponential moving average of the model weights.
    Usage: after model.compile and before the first fit,
    create the object and then call its inject method.
    """
    # K.set_value(x, value): x must be a variable (tf.Variable, K.zeros(), ...)
    # and value a numpy array
    def __init__(self, model, momentum=0.9999):
        self.momentum = momentum
        self.model = model
        # Allocate one zero variable per model weight. model.weights is a list of
        # variables (one per layer parameter); K.batch_get_value() turns such a
        # list into a list of numpy arrays, whereas K.get_value() only accepts a
        # single tensor/variable, which is why it cannot be applied to the list.
        # self.ema_weights = [variable, variable, ...]
        self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights]
    def inject(self):
        """Register the moving-average update ops on the model
        (older Keras exposed model.metrics_updates for this; add_metric is
        used here instead to match the newer API).
        """
        self.initialize()
        for w1, w2 in zip(self.ema_weights, self.model.weights):
            op = K.moving_average_update(w1, w2, self.momentum)
            self.model.add_metric(op)
    def initialize(self):
        """Start the ema_weights from the model's current initialization.
        """
        # Weights as a list of numpy arrays
        self.old_weights = K.batch_get_value(self.model.weights)
        # Copy them into the EMA variables
        K.batch_set_value(zip(self.ema_weights, self.old_weights))
    def apply_ema_weights(self):
        """Back up the current weights, then load the averaged weights into the model.
        """
        self.old_weights = K.batch_get_value(self.model.weights)
        ema_weights = K.batch_get_value(self.ema_weights)
        K.batch_set_value(zip(self.model.weights, ema_weights))
    def reset_old_weights(self):
        """Restore the backed-up (non-averaged) weights.
        """
        K.batch_set_value(zip(self.model.weights, self.old_weights))
EMAer = ExponentialMovingAverage(train_model)
EMAer.inject()
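# --- EMA in one line (added) ------------------------------------------------------------
# Each update performs  w_ema <- momentum * w_ema + (1 - momentum) * w.
# With momentum=0.9999 the average effectively spans the last ~1/(1-momentum) = 10000
# steps. The Evaluate callback below swaps the averaged weights in with
# apply_ema_weights() for evaluation and saving, then restores the live weights
# with reset_old_weights() before training resumes.
# -------------------------------------------------------------------------------------------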
# Extract triples from a raw sentence (used on dev/test data)
def extract_items(text_in):
    # Word segmentation
    text_words = tokenize(text_in.lower())
    text_in = ''.join(text_words)
    pre_items = {}
    # Candidate triples from the automata, as [(s, p, o), ...]
    for sp in spoer.extract_items(text_in):
        # Locate s and o in the sentence (-1 when absent)
        subjectid = text_in.find(sp[0])
        objectid = text_in.find(sp[2])
        if subjectid != -1 and objectid != -1:
            key = (subjectid, subjectid + len(sp[0]))
            if key not in pre_items:
                pre_items[key] = []
            # {(s, s+len): [(o, o+len, pid), ...], ...}
            pre_items[key].append((objectid,
                                   objectid + len(sp[2]),
                                   predicate2id[sp[1]]))
    # _pres=(len, 2): the subject prior
    _pres = np.zeros((len(text_in), 2))
    # Mark the subjects found by the automata
    for j in pre_items:
        _pres[j[0], 0] = 1
        _pres[j[1] - 1, 1] = 1
    # _pres=(1, len, 2)
    _pres = np.expand_dims(_pres, 0)
    R = []
    # _t1: the char id sequence
    _t1 = [char2id.get(c, 1) for c in text_in]
    # As a numpy batch of one, shape (1, len)
    _t1 = np.array([_t1])
    # Word vectors for the segmented sentence, shape (1, len, word_size)
    _t2 = sent2vec([text_words])
    # Predicted subject start/end scores, _k1=_k2=(1, len, 1) for this single sample
    _k1, _k2 = subject_model.predict([_t1, _t2, _pres])
    # Drop the batch and channel axes, _k1=_k2=(len,)
    _k1, _k2 = _k1[0, :, 0], _k2[0, :, 0]
    # np.where(cond) returns the indices satisfying cond; starts use a 0.5
    # threshold, ends a looser 0.4 threshold.
    # _k1, _k2 become index arrays
    _k1, _k2 = np.where(_k1 > 0.5)[0], np.where(_k2 > 0.4)[0]
    _subjects, _PREO = [], []
    # For every predicted start index
    for i in _k1:
        # Keep only end indices at or after the start, since a subject cannot
        # end before it begins
        j = _k2[_k2 >= i]
        # If any end remains, a subject exists
        if len(j) > 0:
            # Take the nearest end
            j = j[0]
            # Slice the subject out of the sentence
            _subject = text_in[i: j + 1]
            # Record (subject, start, end)
            _subjects.append((_subject, i, j))
            # _preo=(len, num, 2): the object prior for this subject
            _preo = np.zeros((len(text_in), num_classes, 2))
            # Fill it from the automaton matches: [(oid, oid+len(o), pid), ...]
            for _ in pre_items.get((i, j + 1), []):
                _preo[_[0], _[2], 0] = 1
                _preo[_[1] - 1, _[2], 1] = 1
            # _preo=(len, num_classes*2)
            _preo = _preo.reshape((len(text_in), -1))
            # _PREO=[(len, num_classes*2), ...]
            _PREO.append(_preo)
    # If at least one subject was found
    if _subjects:
        # Repeat the inputs once per subject: _pres=(1, len, 2) -> (n_subjects, len, 2)
        _PRES = np.repeat(_pres, len(_subjects), 0)
        # (n_subjects, len, num_classes*2)
        _PREO = np.array(_PREO)
        # _t1 -> (n_subjects, len)
        _t1 = np.repeat(_t1, len(_subjects), 0)
        # _t2 -> (n_subjects, len, word_size)
        _t2 = np.repeat(_t2, len(_subjects), 0)
        # Subject spans [[i, j], ...] -> (n, 2) -> (2, n) -> (2, n, 1);
        # _k1 holds the starts, _k2 the ends, each of shape (n, 1)
        _k1, _k2 = np.array([_s[1:] for _s in _subjects]).T.reshape((2, -1, 1))
        # Object predictions, _o1=_o2=(n_subjects, len, num_classes)
        _o1, _o2 = object_model.predict([_t1, _t2, _k1, _k2, _PRES, _PREO])
        # _subjects=[(subject, i, j), ...]
        for i, _subject in enumerate(_subjects):
            _oo1, _oo2 = np.where(_o1[i] > 0.5), np.where(_o2[i] > 0.4)
            for _ooo1, _c1 in zip(*_oo1):
                for _ooo2, _c2 in zip(*_oo2):
                    # A valid object needs end >= start and matching predicate columns
                    if _ooo1 <= _ooo2 and _c1 == _c2:
                        _object = text_in[_ooo1: _ooo2 + 1]
                        _predicate = id2predicate[_c1]
                        R.append((_subject[0], _predicate, _object))
                        break
        # Post-processing: mirror spouse relations and re-apply the album/song rule
        zhuanji, gequ = [], []
        for s, p, o in R[:]:
            if p == u'妻子':
                R.append((o, u'丈夫', s))
            elif p == u'丈夫':
                R.append((o, u'妻子', s))
            if p == u'所属专辑':
                zhuanji.append(o)
                gequ.append(s)
        spo_list = set()
        for s, p, o in R:
            if p in [u'歌手', u'作词', u'作曲']:
                if s in zhuanji and s not in gequ:
                    continue
            spo_list.add((s, p, o))
        return list(spo_list)
    else:
        return []
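# --- Decoding rule (added) ----------------------------------------------------------------
# Starts fire at score > 0.5 and ends at the looser > 0.4; each start is paired
# with the nearest end at or after it. E.g. start indices [2, 9] and end indices
# [4, 11] decode to the spans text[2:5] and text[9:12].
# ---------------------------------------------------------------------------------------------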
class Evaluate(Callback):
    def __init__(self):
        super(Evaluate, self).__init__()
        self.F1 = []
        self.best = 0.
        self.passed = 0
        self.stage = 0
    def on_batch_begin(self, batch, logs=None):
        """The first epoch is used for warmup; without it training may not converge.
        """
        # self.params['steps'] is the number of steps per epoch
        if self.passed < self.params['steps']:
            lr = (self.passed + 1.) / self.params['steps'] * 1e-3
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1
    def on_epoch_end(self, epoch, logs=None):
        # Swap in the EMA weights for evaluation
        EMAer.apply_ema_weights()
        f1, precision, recall = self.evaluate()
        self.F1.append(f1)
        if f1 > self.best:
            self.best = f1
            train_model.save_weights('best_model.weights')
        print('f1: %.4f, precision: %.4f, recall: %.4f, best f1: %.4f\n' % (f1, precision, recall, self.best))
        # Swap the live weights back
        EMAer.reset_old_weights()
        # At epoch 50, or once training stalls (f1 still below 0.5 after epoch 10,
        # or no improvement over the last 8 epochs), reload the best weights and
        # restart with a smaller learning rate and a reset optimizer state
        if epoch + 1 == 50 or (
                self.stage == 0 and epoch > 10 and
                (f1 < 0.5 or np.argmax(self.F1) < len(self.F1) - 8)
        ):
            self.stage = 1
            train_model.load_weights('best_model.weights')
            EMAer.initialize()
            K.set_value(self.model.optimizer.lr, 1e-4)
            K.set_value(self.model.optimizer.iterations, 0)
            opt_weights = K.batch_get_value(self.model.optimizer.weights)
            opt_weights = [w * 0. for w in opt_weights]
            K.batch_set_value(zip(self.model.optimizer.weights, opt_weights))
    def evaluate(self):
        orders = ['subject', 'predicate', 'object']
        # A: true positives, B: predicted count, C: gold count
        # (the 1e-10 avoids division by zero)
        A, B, C = 1e-10, 1e-10, 1e-10
        # Text mode with explicit utf-8 (Python 3)
        F = open('dev_pred.json', 'w', encoding='utf-8')
        for d in tqdm(iter(dev_data)):
            # R=[(s, p, o), ...]: triples extracted from the sentence
            R = set(extract_items(d['text']))
            T = set(d['spo_list'])
            A += len(R & T)
            B += len(R)
            C += len(T)
            s = json.dumps({
                'text': d['text'],
                'spo_list': [
                    dict(zip(orders, spo)) for spo in T
                ],
                'spo_list_pred': [
                    dict(zip(orders, spo)) for spo in R
                ],
                'new': [
                    dict(zip(orders, spo)) for spo in R - T
                ],
                'lack': [
                    dict(zip(orders, spo)) for spo in T - R
                ]
            }, ensure_ascii=False, indent=4)
            F.write(s + '\n')
        F.close()
        # f1, precision, recall
        return 2 * A / (B + C), A / B, A / C
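# --- Metric arithmetic (added) --------------------------------------------------------------
# A = |R ∩ T|, B = |R|, C = |T| accumulated over the dev set, so the returned
# values are F1 = 2A/(B+C), precision = A/B, recall = A/C. For instance,
# A=8, B=10, C=16 gives precision 0.8, recall 0.5 and F1 = 16/26 ≈ 0.615.
# -----------------------------------------------------------------------------------------------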
def test(test_data):
    """Write the test-set predictions to file.
    """
    orders = ['subject', 'predicate', 'object', 'object_type', 'subject_type']
    # Text mode with explicit utf-8 (Python 3)
    F = open('test_pred.json', 'w', encoding='utf-8')
    for d in tqdm(iter(test_data)):
        R = set(extract_items(d['text']))
        s = json.dumps({
            'text': d['text'],
            'spo_list': [
                dict(zip(orders, spo + ('', ''))) for spo in R
            ]
        }, ensure_ascii=False)
        F.write(s + '\n')
    F.close()
train_D = data_generator(train_data)
evaluator = Evaluate()
if __name__ == '__main__':
    train_model.fit_generator(train_D.__iter__(),
                              steps_per_epoch=len(train_D),
                              epochs=120,
                              callbacks=[evaluator]
                              )
else:
    train_model.load_weights('best_model.weights')