1. Project Background
We obtained 50,000 news articles as training data with a web crawler. In each record the first sentence is the news summary (summarization) and the rest is the article body (text). The task is to build a baseline model and a BERT model, train them on this data, and then, given a few news articles, have each model output a summary.
Data preparation:
Create a vocab.json file holding the character vocabulary; the padding, unk, start and end tokens occupy the first four positions of the table.
import os
import json

import pandas as pd
from tqdm import tqdm

# df is the training DataFrame (one summary and one article per row);
# min_count is the minimum character frequency kept in the vocabulary
if os.path.exists('vocab.json'):
    chars, id2char, char2id = json.load(open('vocab.json'))
    # JSON keys are strings, so convert them back to ints
    id2char = {int(i): j for i, j in id2char.items()}
else:
    chars = {}
    for a in tqdm(df.values):
        for w in a[1]:  # raw characters, no word segmentation needed
            chars[w] = chars.get(w, 0) + 1
        for w in a[0]:  # raw characters, no word segmentation needed
            chars[w] = chars.get(w, 0) + 1
    chars = {i: j for i, j in chars.items() if j >= min_count}
    # 0: padding
    # 1: unk
    # 2: start
    # 3: end
    id2char = {i + 4: j for i, j in enumerate(chars)}
    char2id = {j: i for i, j in id2char.items()}
    json.dump([chars, id2char, char2id], open('vocab.json', 'w'))
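The beam search and evaluation code further down use str2id and id2str helpers that are not listed in this post. A minimal sketch consistent with the vocabulary above (ids 0–3 are padding/unk/start/end and real characters start at id 4; the maxlen truncation length is a hypothetical value):
maxlen = 400  # hypothetical maximum article length in characters

def str2id(s, start_end=False):
    """Map a string to character ids; unknown characters become 1 (<unk>)."""
    if start_end:
        # wrap the summary with <start> (2) and <end> (3)
        ids = [char2id.get(c, 1) for c in s[:maxlen - 2]]
        ids = [2] + ids + [3]
    else:
        ids = [char2id.get(c, 1) for c in s[:maxlen]]
    return ids

def id2str(ids):
    """Map ids back to a string; ids with no character (e.g. special tokens) are dropped."""
    return ''.join([id2char.get(i, '') for i in ids])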
2. Baseline built on Self-attention
# Keras 2.x with the TensorFlow 1.x backend is assumed throughout
from keras import backend as K
from keras.layers import Layer, Input, Embedding, Lambda

class SelfAttention(Layer):
    def __init__(self, n_head, head_size, **kwargs):
        self.n_head = n_head
        self.head_size = head_size
        self.out_dim = n_head * head_size
        super(SelfAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(SelfAttention, self).build(input_shape)
        # each input is [batch_size, sequence_length, embed_size]
        q_dim = input_shape[0][-1]
        k_dim = input_shape[1][-1]
        self.v_dim = input_shape[2][-1]
        self.q_matrix = self.add_weight(name='q_matrix', dtype='float32',
                                        shape=(q_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_matrix = self.add_weight(name='k_matrix', dtype='float32',
                                        shape=(k_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_matrix = self.add_weight(name='v_matrix', dtype='float32',
                                        shape=(self.v_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.head_matrix = self.add_weight(name='head_matrix', dtype='float32',
                                           shape=(self.out_dim, self.v_dim),
                                           initializer='glorot_normal')

    def call(self, inputs, **kwargs):
        q, k, v = inputs
        # linear projections: [batch_size, seq_len, n_head*head_size]
        qw = K.dot(q, self.q_matrix)
        kw = K.dot(k, self.k_matrix)
        vw = K.dot(v, self.v_matrix)
        # [batch_size, seq_len, n_head, head_size]
        qw = K.reshape(qw, [-1, K.shape(qw)[1], self.n_head, self.head_size])
        kw = K.reshape(kw, [-1, K.shape(kw)[1], self.n_head, self.head_size])
        vw = K.reshape(vw, [-1, K.shape(vw)[1], self.n_head, self.head_size])
        # [batch_size, n_head, seq_len, head_size]
        qw = K.permute_dimensions(qw, [0, 2, 1, 3])
        kw = K.permute_dimensions(kw, [0, 2, 1, 3])
        vw = K.permute_dimensions(vw, [0, 2, 1, 3])
        # scaled dot-product attention scores: [batch_size, n_head, seq_len, seq_len]
        out = K.batch_dot(qw, kw, axes=[3, 3]) / (self.head_size ** 0.5)
        out = K.softmax(out)
        # weighted sum of the values: [batch_size, n_head, seq_len, head_size]
        out = K.batch_dot(out, vw, axes=[3, 2])
        # back to [batch_size, seq_len, n_head*head_size]
        out = K.permute_dimensions(out, [0, 2, 1, 3])
        out = K.reshape(out, [-1, K.shape(out)[1], self.out_dim])
        # project back to [batch_size, seq_len, v_dim]
        out = K.dot(out, self.head_matrix)
        return out

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.v_dim)
# toy usage: self-attention over an embedded character sequence (q = k = v)
q = Input(shape=[None])
q_embedding = Embedding(10000, 300)(q)
# padding mask (computed here, but not yet consumed by this simple baseline layer)
mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(q)
out = SelfAttention(8, 64)([q_embedding, q_embedding, q_embedding])
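As a quick sanity check, the block above can be wrapped in a Model to confirm the output shape (this wrapper is purely illustrative and not part of the baseline):
from keras.models import Model

toy = Model(q, out)
toy.summary()
# the SelfAttention output has shape (None, None, 300), i.e. [batch_size, seq_len, v_dim]:
# head_matrix projects the 8*64-dim concatenation of heads back to the 300-dim embedding size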
3. BiLSTM baseline model
Model structure optimizations:
1. Use a multi-head-attention-like mechanism instead of plain attention for the seq2seq interaction.
2. Add LayerNormalization (see the paper https://arxiv.org/pdf/1607.06450v1.pdf; a minimal sketch of the layer follows this list).
3. Masking:
Variable-length sequences are padded with 0s, and these padded positions need special handling. We usually define a mask tensor of shape [batch_size, seq_len, 1]. In the ordinary case the mask is multiplied element-wise with the data so that the padded positions become 0; when a softmax follows, the padded positions must instead be set to a very large negative number (in practice by subtracting a large constant), so that they receive essentially zero probability.
4. Bidirectional model.
5. Prior features:
Injecting prior knowledge usually speeds up convergence. Since both the input and the output language are Chinese, the encoder and decoder can share one Embedding layer (i.e. the same set of character vectors), which greatly reduces the number of parameters.
There is another very useful prior: most characters of the summary also appear in the article (they merely appear there, not necessarily contiguously, and the summary is certainly not a substring of the article, otherwise this would degenerate into an ordinary sequence-labelling problem). We can therefore use the set of characters occurring in the article as a prior distribution and add it to the classifier used during decoding, so that at each decoding step the model prefers characters that already occur in the article (the to_one_hot sketch in subsection 1 below shows how this indicator vector can be built).
Original classification scheme: at each prediction step we obtain a combined vector x, feed it into a fully connected layer and get a vector y = (y1, y2, …, y|V|) of size |V|, where |V| is the vocabulary size; applying softmax to y gives the original output probabilities.
Scheme with a prior distribution: for each article we build a 0/1 vector χ = (χ1, χ2, …, χ|V|) of size |V|, where χi = 1 means the i-th character appears in the article and χi = 0 otherwise. This 0/1 vector is passed through a scale-and-shift layer to obtain a prior logit vector, which is then averaged with the original y before the softmax is applied.
6. Beam search decoding:
See https://en.wikipedia.org/wiki/Beam_search and https://www.jianshu.com/p/bc3beb101885
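The LayerNormalization layer used in the model construction below is not reproduced in this post. A minimal sketch following the cited paper; the epsilon value and the gamma/beta parameter names are assumptions:
class LayerNormalization(Layer):
    """Normalize each position over its feature dimension, then rescale (sketch)."""
    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=(input_shape[-1],),
                                     initializer='ones')
        self.beta = self.add_weight(name='beta', shape=(input_shape[-1],),
                                    initializer='zeros')

    def call(self, inputs, **kwargs):
        mean = K.mean(inputs, axis=-1, keepdims=True)
        variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True)
        outputs = (inputs - mean) / K.sqrt(variance + 1e-6)
        return outputs * self.gamma + self.beta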
1. Learning a prior distribution over the output
class ScaleShift(Layer):
    """Scale-and-shift layer: y = exp(log_scale) * x + shift
    """
    def __init__(self, **kwargs):
        super(ScaleShift, self).__init__(**kwargs)

    def build(self, input_shape):
        # e.g. (1, 1, 5860): one scale/shift parameter per vocabulary entry
        kernel_shape = (1,) * (len(input_shape) - 1) + (input_shape[-1],)
        self.log_scale = self.add_weight(name='log_scale',
                                         shape=kernel_shape,
                                         initializer='zeros')
        self.shift = self.add_weight(name='shift',
                                     shape=kernel_shape,
                                     initializer='zeros')

    def call(self, inputs, **kwargs):
        x_outs = K.exp(self.log_scale) * inputs + self.shift
        return x_outs
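The graph() function in subsection 4 feeds ScaleShift with the output of a to_one_hot helper that is not reproduced in this post. A minimal sketch of what it could look like, assuming its input is the article's token-id tensor together with its mask, and that the vocabulary size is the len(chars) + 4 used by the Embedding layer:
def to_one_hot(x_and_mask):
    """0/1 indicator over the vocabulary of the characters appearing in each article (sketch)."""
    x, x_mask = x_and_mask
    x = K.cast(x, 'int32')
    x = K.one_hot(x, len(chars) + 4)          # [batch_size, seq_len, |V|]
    x = K.sum(x_mask * x, 1, keepdims=True)   # [batch_size, 1, |V|], per-character counts
    x = K.cast(K.greater(x, 0.5), 'float32')  # 1 if the character occurs at least once
    return x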
2. Bidirectional RNN
class OurBidirectional(OurLayer):
    """A hand-rolled bidirectional RNN wrapper that accepts a mask,
    so that the reversed sequence stays aligned with the padding.
    """
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = copy.deepcopy(layer)
        self.backward_layer = copy.deepcopy(layer)
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """Here mask.shape is [batch_size, seq_len, 1]
        """
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        # reverse along axis 1 (the sequence dimension), but only within each true length
        return K.tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs, **kwargs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        # reverse again so the backward states line up with the forward sequence
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], 2)
        return x * mask

    def compute_output_shape(self, input_shape):
        return (None, input_shape[0][1], self.forward_layer.units * 2)
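OurBidirectional above and the Attention layer below both inherit from an OurLayer base class that is not shown in this post; it is essentially a Layer whose reuse() method calls an inner layer while registering that layer's weights on the outer one. A minimal sketch of the assumed interface:
class OurLayer(Layer):
    """Base layer whose call() may reuse inner Keras layers (sketch of the assumed interface)."""
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        # register the inner layer's weights as weights of the outer layer
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        return outputs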
3. Multi-head attention
class Attention(OurLayer):
    """Multi-head attention
    """
    def __init__(self, heads, size_per_head, key_size=None,
                 mask_right=False, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.heads = heads
        self.size_per_head = size_per_head
        self.out_dim = heads * size_per_head
        self.key_size = key_size if key_size else size_per_head
        self.mask_right = mask_right

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        self.q_dense = Dense(self.key_size * self.heads, use_bias=False)
        self.k_dense = Dense(self.key_size * self.heads, use_bias=False)
        self.v_dense = Dense(self.out_dim, use_bias=False)

    def mask(self, x, mask, mode='mul'):
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # linear projections
        qw = self.reuse(self.q_dense, q)
        kw = self.reuse(self.k_dense, k)
        vw = self.reuse(self.v_dense, v)
        # reshape to separate the heads
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.heads, self.key_size))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.heads, self.key_size))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.heads, self.size_per_head))
        # transpose to [batch_size, heads, seq_len, size]
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # scaled dot-product attention, with the key/value padding masked out additively
        a = K.batch_dot(qw, kw, [3, 3]) / self.key_size ** 0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        if self.mask_right:
            ones = K.ones_like(a[:1, :1])
            mask = (ones - K.tf.matrix_band_part(ones, -1, 0)) * 1e10
            a = a - mask
        a = K.softmax(a)
        # produce the output
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
4. Model construction
def graph():
    # char_size (embedding size) and z_dim (LSTM hidden size) are hyperparameters defined elsewhere
    x_in = Input(shape=(None,))
    y_in = Input(shape=(None,))
    x, y = x_in, y_in
    # padding masks: 1 for real tokens, 0 for padding
    x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)
    y_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(y)
    x_one_hot = Lambda(to_one_hot)([x, x_mask])
    print("x_one_hot:", x_one_hot)
    # learn a prior distribution over the output (the summary's characters very likely appear in the article)
    x_prior = ScaleShift()(x_one_hot)
    print("x_prior:", x_prior)
    # shared embedding for encoder and decoder
    embedding = Embedding(len(chars) + 4, char_size)
    x = embedding(x)
    y = embedding(y)
    # encoder: two bidirectional LSTM layers
    x = OurBidirectional(CuDNNLSTM(z_dim // 2, return_sequences=True))([x, x_mask])
    x = LayerNormalization()(x)
    x = OurBidirectional(CuDNNLSTM(z_dim // 2, return_sequences=True))([x, x_mask])
    x = LayerNormalization()(x)
    # decoder: two unidirectional LSTM layers
    y = CuDNNLSTM(z_dim, return_sequences=True)(y)
    y = LayerNormalization()(y)
    y = CuDNNLSTM(z_dim, return_sequences=True)(y)
    y = LayerNormalization()(y)
    # attention interaction: in self-attention q, k and v are the same tensor; here it is
    # cross-attention between summary and article, so q comes from the decoder (summary)
    # while k and v come from the encoder (article), masked with x_mask
    xy = Attention(8, 16)([y, x, x, x_mask])
    xy = Concatenate()([y, xy])
    # output classifier
    xy = Dense(char_size)(xy)
    xy = Activation('relu')(xy)
    xy = Dense(len(chars) + 4)(xy)
    # average the logits with the prior
    xy = Lambda(lambda x: (x[0] + x[1]) / 2)([xy, x_prior])
    xy = Activation('softmax')(xy)
    # cross entropy as the loss, with the padding positions masked out
    cross_entropy = K.sparse_categorical_crossentropy(y_in[:, 1:], xy[:, :-1])
    cross_entropy = K.sum(cross_entropy * y_mask[:, 1:, 0]) / K.sum(y_mask[:, 1:, 0])
    model = Model([x_in, y_in], xy)
    # model.load_weights('best_model.weights')
    model.add_loss(cross_entropy)
    model.compile(optimizer=Adam(1e-3))
    return model
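graph() is trained with fit_generator on batches produced by a data generator that is not listed here. A minimal sketch under these assumptions: df is the training DataFrame from the vocabulary step, str2id is the helper sketched after the vocabulary code, padding right-pads with 0, and batch_size = 64 is an illustrative value:
import numpy as np

def padding(x):
    """Right-pad a batch of id lists with 0 to the longest length in the batch."""
    ml = max(len(i) for i in x)
    return np.array([i + [0] * (ml - len(i)) for i in x])

def data_generator(batch_size=64):
    # yields ([article_ids, summary_ids], None); the loss is already attached to the model via add_loss
    X, Y = [], []
    while True:
        for a in df.values:
            X.append(str2id(a[1]))                  # article characters
            Y.append(str2id(a[0], start_end=True))  # summary wrapped in <start>/<end>
            if len(X) == batch_size:
                yield [padding(X), padding(Y)], None
                X, Y = [], []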
5. Beam search at decoding time
def gen_sent(s, model, topk=3, maxlen=64):
    """Beam search decoding:
    only the topk best candidates are kept at each step; with topk=1 it degenerates to greedy search.
    """
    # convert the input article to ids, replicated once per beam
    xid = np.array([str2id(s)] * topk)
    # every beam starts with <start>, whose id is 2
    yid = np.array([[2]] * topk)
    # running scores of the candidates
    scores = [0] * topk
    # generate at most maxlen characters
    for i in range(maxlen):
        # ignore <padding>, <unk> and <start> in the output distribution
        proba = model.predict([xid, yid])[:, i, 3:]
        # work in log space for convenience
        log_proba = np.log(proba + 1e-6)
        # take the topk of each row; argsort is ascending, so take the last topk entries
        arg_topk = log_proba.argsort(axis=1)[:, -topk:]
        # buffer for the candidate target sequences
        _yid = []
        # buffer for their scores
        _scores = []
        # the first step only needs topk candidates
        if i == 0:
            for j in range(topk):
                # first generated id (offset by 3 because of the slice above); yid already holds <start>
                _yid.append(list(yid[j]) + [arg_topk[0][j] + 3])
                # score of the first generated id
                _scores.append(scores[j] + log_proba[0][arg_topk[0][j]])
        # later steps need to consider topk * topk combinations
        else:
            # iterate over all topk * topk extensions
            for j in range(topk):
                for k in range(topk):
                    _yid.append(list(yid[j]) + [arg_topk[j][k] + 3])
                    _scores.append(scores[j] + log_proba[j][arg_topk[j][k]])
        # keep the best topk of them
        _arg_topk = np.argsort(_scores)[-topk:]
        _yid = [_yid[k] for k in _arg_topk]
        _scores = [_scores[k] for k in _arg_topk]
        yid = np.array(_yid)
        scores = np.array(_scores)
        # beams that have produced <end> (id 3)
        ends = np.where(yid[:, -1] == 3)[0]
        if len(ends) > 0:
            k = ends[scores[ends].argmax()]
            return id2str(yid[k])
    # if no <end> appears within maxlen characters, return the best beam directly
    return id2str(yid[np.argmax(scores)])
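A hypothetical call once the model has been trained (the article string is a placeholder):
article = '...'  # one news article (placeholder)
print(gen_sent(article, model, topk=3))  # beam search with 3 beams
print(gen_sent(article, model, topk=1))  # topk=1 degenerates to greedy decoding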
6. Prediction and evaluation
class Evaluate(Callback):
    def __init__(self, model):
        self.lowest = 1e10
        self.model = model

    def on_epoch_end(self, epoch, logs=None):
        # print one or two examples during training to watch the summaries improve
        print(gen_sent(s1, self.model))
        print(gen_sent(s2, self.model))
        summarization = test_df['summarization'].values
        text = test_df['text'].values
        pred = []
        for t, s in tqdm(zip(text, summarization)):
            pred.append(gen_sent(t, self.model))
        # average ROUGE-1 F1 over the test set
        rouge_1 = rouge.Rouge().get_scores(pred, summarization.tolist(), avg=True)['rouge-1']['f']
        print('rouge-1:', rouge_1)
        # keep the weights with the lowest training loss
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            self.model.save_weights('best_model.weights')
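A hypothetical way to wire the baseline together (steps_per_epoch and epochs are illustrative values; s1, s2 and test_df used by the callback are assumed to be defined elsewhere):
model = graph()
evaluator = Evaluate(model)
model.fit_generator(data_generator(),
                    steps_per_epoch=1000,
                    epochs=50,
                    callbacks=[evaluator])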
4. BERT pre-trained model
Read the original paper: https://arxiv.org/abs/1810.04805
Download the bert4keras toolkit.
Download the chinese_L-12_H-768_A-12 pre-trained BERT model and fine-tune on top of it; it consists of the following files:
bert_config.json
bert_model.ckpt.meta
bert_model.ckpt.data-00000-of-00001
bert_model.ckpt.index
vocab.txt
Data preparation:
Store every character and punctuation mark from the training data in seq2seq_config.json, filtering out any that are not in the BERT vocabulary.
config_path = 'seq2seq_config.json'
min_count = 32
max_input_len = 450
max_output_len = 32
batch_size = 4
steps_per_epoch = 1000
epochs = 10000
def read_text():
    df = pd.read_csv('data/train.csv')
    text = df['text'].values
    summarization = df['summarization'].values
    for t, s in zip(text, summarization):
        # only keep samples whose summary fits within max_output_len
        if len(s) <= max_output_len:
            # truncate the article to max_input_len characters
            yield t[:max_input_len], s


if os.path.exists(config_path):
    chars = json.load(open(config_path, encoding='utf-8'))
else:
    chars = {}
    for a in tqdm(read_text(), desc='building the character table'):
        for b in a:
            for w in b:
                chars[w] = chars.get(w, 0) + 1
    chars = [(i, j) for i, j in chars.items() if j >= min_count]
    # chars = [(i, j) for i, j in chars.items()]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    json.dump(
        chars,
        codecs.open(config_path, 'w', encoding='utf-8'),
        indent=4,
        ensure_ascii=False
    )
# note: config_path is reassigned here to point at the BERT config
# (the character table above has already been built from seq2seq_config.json)
config_path = 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'chinese_L-12_H-768_A-12/vocab.txt'

# load the full BERT vocabulary
_token_dict = load_vocab(dict_path)
# keep_words holds the BERT vocabulary indices we keep;
# token_dict maps word -> index in the reduced vocabulary
token_dict, keep_words = {}, []
for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])
for c in chars:
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

# build the tokenizer
tokenizer = SimpleTokenizer(token_dict)
Tokenizer:
Splits text directly into a sequence of single characters. It is designed specifically for Chinese and in principle is only suitable for Chinese models.
class SimpleTokenizer:
    def __init__(self, token_dict):
        """Initialize with the vocabulary
        """
        self._token_dict = token_dict
        self._token_dict_inv = {v: k for k, v in token_dict.items()}

    def _is_space(self, c):
        """Is the character whitespace?
        """
        return c == ' ' or c == '\n' or c == '\r' or c == '\t'

    def _is_special(self, c):
        """Is the token a special marker wrapped in square brackets?
        """
        return bool(c) and (c[0] == '[') and (c[-1] == ']')

    def tokenize(self, text, add_cls=True, add_sep=True):
        """Split into single characters
        """
        R = []
        if add_cls:
            R.append('[CLS]')
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # whitespace is represented by the untrained [unused1] token
            else:
                R.append('[UNK]')  # all remaining characters become [UNK]
        if add_sep:
            R.append('[SEP]')
        return R

    def encode(self, first, second=None, first_length=None):
        """Return the token ids and segment ids of the text (pair).
        If first_length is given, the first sentence is force-padded to that length.
        """
        token_ids, segment_ids = [], []
        token_ids.extend([self._token_dict[c] for c in self.tokenize(first)])
        segment_ids.extend([0] * (len(first) + 2))
        if first_length is not None and len(token_ids) < first_length + 2:
            token_ids.extend([0] * (first_length + 2 - len(token_ids)))
            segment_ids.extend([0] * (first_length + 2 - len(segment_ids)))
        if second is not None:
            token_ids.extend([
                self._token_dict[c]
                for c in self.tokenize(second, add_cls=False)
            ])
            segment_ids.extend([1] * (len(second) + 1))
        return token_ids, segment_ids

    def decode(self, token_ids, join_str=''):
        """Simple conversion from an id sequence back to text
        """
        tokens = []
        for i in token_ids:
            t = self._token_dict_inv.get(i, '')
            if t == '[unused1]':
                tokens.append(' ')
            elif not self._is_special(t):
                tokens.append(t)
        return join_str.join(tokens)
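A hypothetical round trip through the tokenizer (the sentences are arbitrary examples; the exact ids depend on the reduced vocabulary built above):
token_ids, segment_ids = tokenizer.encode(u'今天天气不错', u'天气不错')
# token_ids:   [CLS] + first-sentence ids + [SEP] + second-sentence ids + [SEP]
# segment_ids: 0 for the first sentence (including its [CLS]/[SEP]), 1 for the second sentence
print(tokenizer.decode(token_ids))  # special tokens are dropped: '今天天气不错天气不错'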
Load the model
def load_pretrained_model(config_path,
                          checkpoint_file=None,
                          with_mlm=False,
                          seq2seq=False,
                          keep_words=None,
                          albert=False):
    """Build the model from the config file and load the checkpoint weights
    """
    config = json.load(open(config_path))
    if seq2seq:
        Bert = Bert4Seq2seq
    else:
        Bert = BertModel
    bert = Bert(vocab_size=config['vocab_size'],
                max_position_embeddings=config['max_position_embeddings'],
                hidden_size=config['hidden_size'],
                num_hidden_layers=config['num_hidden_layers'],
                num_attention_heads=config['num_attention_heads'],
                intermediate_size=config['intermediate_size'],
                hidden_act=config['hidden_act'],
                dropout_rate=config['hidden_dropout_prob'],
                embedding_size=config.get('embedding_size'),
                with_mlm=with_mlm,
                keep_words=keep_words,
                block_sharing=albert)
    bert.build()
    if checkpoint_file is not None:
        bert.load_weights_from_checkpoint(checkpoint_file)
    return bert.model
model = load_pretrained_model(
    config_path,
    checkpoint_path,
    seq2seq=True,
    keep_words=keep_words,  # keep only the characters in keep_words, shrinking the original vocabulary
)
# cross entropy as the loss, masking out the predictions over the input part
# target tokens: the sequence starts with [CLS], so drop the first position
y_in = model.input[0][:, 1:]
# the segment ids double as the output mask (1 only over the summary segment)
y_mask = model.input[1][:, 1:]
# predicted tokens: predictions are shifted one position relative to the targets
# (the sequence ends with [SEP], and the last prediction is dropped)
y = model.output[:, :-1]
# cross entropy between the target tokens and the predictions
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
# load previously saved weights to resume training
model.load_weights('./best_model2.weights')
model.fit_generator(
    data_generator(),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[evaluator]
)
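The data_generator and evaluator passed to fit_generator are not listed above. A minimal sketch under these assumptions: each sample packs article and summary into one sequence with tokenizer.encode (article as segment 0, summary as segment 1), batches are right-padded with token id 0, and the callback simply checkpoints the weights whenever the training loss reaches a new low:
import numpy as np
from keras.callbacks import Callback

def padding(x):
    """Right-pad a batch of id lists with 0 to the longest length in the batch."""
    ml = max(len(i) for i in x)
    return np.array([i + [0] * (ml - len(i)) for i in x])

def data_generator():
    while True:
        X, S = [], []
        for a, b in read_text():
            x, s = tokenizer.encode(a, b)  # article -> segment 0, summary -> segment 1
            X.append(x)
            S.append(s)
            if len(X) == batch_size:
                yield [padding(X), padding(S)], None  # the loss is already attached via add_loss
                X, S = [], []

class Evaluator(Callback):
    """Checkpoint the weights on every new lowest training loss (sketch)."""
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            self.model.save_weights('./best_model2.weights')

evaluator = Evaluator()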