Dataset: 2014人民日报.zip (People's Daily 2014 corpus)
Source code: https://github.com/circlePi/Bert_Chinese_Ner_pytorch
# B_PER marks the start of a person name, B_T the start of a time expression,
# B_ORG the start of an organization, and B_LOC the start of a location.
# I_* marks the non-initial characters inside an entity; O marks characters outside any entity.
labels = ["B_PER", "I_PER", "B_T", "I_T", "B_ORG", "I_ORG", "B_LOC", "I_LOC", "O"]
BERT produces a len(labels)-dimensional score vector for every token, and a CRF layer on top of those scores computes the loss.
Architecture: BERT + CRF
Data processing
The raw data is already split and labeled character by character, so no further tokenization is needed; the characters and tags only have to be assembled into training examples.
Each sample is a pair of parallel sequences: a data line of characters and a label line of tags.
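For illustration, a sample and its tag sequence look roughly like this (the sentence below is made up, not taken from the corpus):

data:  彭 德 怀 昨 天 在 北 京 会 见 了 外 宾
label: B_PER I_PER I_PER B_T I_T O B_LOC I_LOC O O O O O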
Building the training data:
The constructed features take the usual BERT form: input_ids, input_mask, segment_ids, label_ids, plus an output_mask.
Note that when building output_mask, the positions of [CLS] and [SEP] are set to 0 so that the CRF only operates on real characters; a sketch of this follows.
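A minimal sketch of that masking, assuming a pytorch_pretrained_bert tokenizer and a max_seq_length setting (the helper name and padding scheme here are illustrative; the repo's actual preprocessing differs in detail):

def build_features(chars, label_ids, tokenizer, max_seq_length):
    # Hypothetical helper: wrap one character-split sentence for BERT.
    tokens = ["[CLS]"] + chars + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)      # attention mask for BERT
    segment_ids = [0] * len(input_ids)     # single-sentence task
    # output_mask: 0 at [CLS]/[SEP] and padding, 1 at real characters,
    # so the CRF loss and decoding skip the special tokens
    output_mask = [0] + [1] * len(chars) + [0]
    pad_len = max_seq_length - len(input_ids)
    input_ids += [0] * pad_len
    input_mask += [0] * pad_len
    segment_ids += [0] * pad_len
    output_mask += [0] * pad_len
    # labels are padded with -1, matching the filtering done in predict()
    label_ids = label_ids + [-1] * (max_seq_length - len(label_ids))
    return input_ids, input_mask, segment_ids, label_ids, output_mask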
Model
Model definition:
import torch.nn as nn
import numpy as np
from sklearn.metrics import f1_score, classification_report
import config.args as args
from net.crf import CRF
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel


class Bert_CRF(BertPreTrainedModel):
    def __init__(self, config, num_tag):
        super(Bert_CRF, self).__init__(config)
        # BERT encoder
        self.bert = BertModel(config)
        # for p in self.bert.parameters():
        #     p.requires_grad = False
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # project the hidden states onto the tag space
        self.classifier = nn.Linear(config.hidden_size, num_tag)
        self.apply(self.init_bert_weights)
        self.crf = CRF(num_tag)  # CRF layer, used to compute the loss

    def forward(self, input_ids, token_type_ids, attention_mask,
                label_id=None, output_all_encoded_layers=False):
        bert_encode, _ = self.bert(input_ids, token_type_ids, attention_mask,
                                   output_all_encoded_layers=output_all_encoded_layers)
        # bert_encode: (batch_size, seq_len, config.hidden_size)
        output = self.classifier(bert_encode)
        return output

    def loss_fn(self, bert_encode, output_mask, tags):
        # CRF negative log-likelihood instead of per-token cross-entropy
        loss = self.crf.negative_log_loss(bert_encode, output_mask, tags)
        return loss

    def predict(self, bert_encode, output_mask):
        predicts = self.crf.get_batch_best_path(bert_encode, output_mask)
        predicts = predicts.view(1, -1).squeeze()
        predicts = predicts[predicts != -1]  # drop the -1 padding
        return predicts

    def acc_f1(self, y_pred, y_true):
        y_pred = y_pred.numpy()
        y_true = y_true.numpy()
        f1 = f1_score(y_true, y_pred, average="macro")
        correct = np.sum((y_true == y_pred).astype(int))
        acc = correct / y_pred.shape[0]
        return acc, f1

    def class_report(self, y_pred, y_true):
        y_true = y_true.numpy()
        y_pred = y_pred.numpy()
        classify_report = classification_report(y_true, y_pred)
        print('\n\nclassify_report:\n', classify_report)
The important part is how the loss is computed: it is not a plain cross-entropy. Instead, the classifier outputs are treated as the CRF's emission scores, and a transition matrix inside the CRF is trained jointly with them.
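Concretely, the score of a tag path is the sum of the emission scores along the path and the transition scores between consecutive tags, and the loss is the negative log-likelihood of the gold path (this is what real_path_score and negative_log_loss compute below):

score(x, y) = sum_t E[t, y_t] + sum_t T[y_t, y_{t+1}]
loss = -score(x, y_gold) + log( sum over all paths y' of exp(score(x, y')) )

where E holds the classifier outputs, T is the transition matrix, and every path is padded with the START and END tags. The CRF layer itself is defined as follows: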
import torch
import torch.nn as nn
from torch.autograd import Variable


class CRF(nn.Module):
    """Linear-chain conditional random field."""

    def __init__(self, num_tag, use_cuda=False):
        if num_tag <= 0:
            raise ValueError("Invalid value of num_tag: %d" % num_tag)
        super(CRF, self).__init__()
        self.num_tag = num_tag
        self.start_tag = num_tag
        self.end_tag = num_tag + 1
        self.use_cuda = use_cuda
        # transition matrix: P_jk is the score of moving from tag_j to tag_k
        # row P_j* holds all edges leaving tag_j,
        # column P_*k holds all edges entering tag_k
        self.transitions = nn.Parameter(torch.Tensor(num_tag + 2, num_tag + 2))
        nn.init.uniform_(self.transitions, -0.1, 0.1)
        # transitions out of END and into START are impossible events;
        # give them a huge negative score so they are never chosen
        self.transitions.data[self.end_tag, :] = -10000
        self.transitions.data[:, self.start_tag] = -10000
    def real_path_score(self, features, tags):
        """
        features: (time_steps, num_tag)
        real_path_score is the score of the gold path.
        It is the sum of two parts:
        - the emission score, given by the encoder output (BERT here)
          at the gold tags;
        - the transition score, the trainable, randomly initialized CRF
          parameters that encode the constraints between adjacent tags.
        During training we want real_path_score to be the highest among
        all possible paths.
        """
        r = torch.LongTensor(range(features.size(0)))
        if self.use_cuda:
            pad_start_tags = torch.cat(
                [torch.cuda.LongTensor([self.start_tag]), tags])
            pad_stop_tags = torch.cat(
                [tags, torch.cuda.LongTensor([self.end_tag])])
            r = r.cuda()
        else:
            pad_start_tags = torch.cat([torch.LongTensor([self.start_tag]), tags])
            pad_stop_tags = torch.cat([tags, torch.LongTensor([self.end_tag])])
        # transition score + emission score
        score = torch.sum(self.transitions[pad_start_tags, pad_stop_tags]).cpu() + \
            torch.sum(features[r, tags])
        return score
    def all_possible_path_score(self, features):
        """
        Log-sum of the scores of all possible paths: the forward algorithm.
        step 1: expand the forward column into a (num_tag, num_tag) matrix
        step 2: expand the next word's emission row the same way
        step 3: add both to the corresponding transition matrix entries
        step 4: update forward by reducing over the rows
        step 5: take the log of the summed exponentials to get the total
        """
        time_steps = features.size(0)
        # the emission score of START_TAG is initialized to 0
        forward = Variable(torch.zeros(self.num_tag))
        if self.use_cuda:
            forward = forward.cuda()
        # START_TAG -> 1st word -> 2nd word -> ... -> END_TAG
        for i in range(0, time_steps):
            emission_start = forward.expand(self.num_tag, self.num_tag).t()
            emission_end = features[i, :].expand(self.num_tag, self.num_tag)
            if i == 0:
                trans_score = self.transitions[self.start_tag,
                                               :self.start_tag].cpu()
            else:
                trans_score = self.transitions[:self.start_tag,
                                               :self.start_tag].cpu()
            scores = emission_start + emission_end + trans_score
            forward = log_sum(scores, dim=0)
        forward = forward + \
            self.transitions[:self.start_tag, self.end_tag].cpu()  # END_TAG
        total_score = log_sum(forward, dim=0)
        return total_score
    def negative_log_loss(self, inputs, output_mask, tags):
        """
        inputs: (batch_size, time_steps, num_tag)
        target_function = P_real_path_score / P_all_possible_path_score
                        = exp(S_real_path_score) / sum(exp(S_certain_path_score))
        We want the probability of the real path to be as high as possible,
        i.e. the target function as large as possible, so the loss is its
        negative logarithm, which we minimize:
        loss_function = -log(target_function)
                      = -S_real_path_score + log(exp(S_1) + exp(S_2) + ...)
                      = -S_real_path_score + S_all_possible_path_score
        (all_possible_path_score already returns the log-sum-exp.)
        """
        if not self.use_cuda:
            inputs = inputs.cpu()
            output_mask = output_mask.cpu()
            tags = tags.cpu()
        loss = Variable(torch.tensor(0.), requires_grad=True)
        num_chars = torch.sum(output_mask.detach()).float()
        for ix, (features, tag) in enumerate(zip(inputs, tags)):
            # filter out [CLS], [SEP] and sub-word positions
            # features: (time_steps, num_tag)
            # output_mask: (batch_size, time_steps)
            num_valid = torch.sum(output_mask[ix].detach())
            features = features[output_mask[ix] == 1]
            tag = tag[:num_valid]
            real_score = self.real_path_score(features, tag)
            total_score = self.all_possible_path_score(features)
            cost = total_score - real_score
            loss = loss + cost
        return loss / num_chars
    def viterbi(self, features):
        time_steps = features.size(0)
        # best score for paths ending at each tag after the first word:
        # the START -> tag transition plus the first emission
        # (mirrors the i == 0 step of the forward algorithm above)
        forward = self.transitions[self.start_tag, :self.start_tag].cpu() + features[0, :]
        if self.use_cuda:
            forward = forward.cuda()
        # back_points: best score reaching each tag at every step
        # index_points: index of the previous tag on that best path
        back_points = [forward]
        index_points = [torch.LongTensor([-1]).expand_as(forward)]
        # START_TAG -> 1st word -> 2nd word -> ... -> END_TAG
        for i in range(1, time_steps):
            emission_start = forward.expand(self.num_tag, self.num_tag).t()
            emission_end = features[i, :].expand(self.num_tag, self.num_tag)
            trans_score = self.transitions[:self.start_tag,
                                           :self.start_tag].cpu()
            scores = emission_start + emission_end + trans_score
            forward, index = torch.max(scores.detach(), dim=0)
            back_points.append(forward)
            index_points.append(index)
        back_points.append(forward +
                           self.transitions[:self.start_tag, self.end_tag].cpu())  # END_TAG
        return back_points, index_points
    def get_best_path(self, features):
        back_points, index_points = self.viterbi(features)
        # start from the best tag at the final step
        best_last_point = argmax(back_points[-1])
        index_points = torch.stack(index_points)  # stack into a matrix
        m = index_points.size(0)
        best_path = [best_last_point]
        # follow the back-pointers to recover the rest of the path
        for i in range(m - 1, 0, -1):
            best_index_point = index_points[i][best_last_point]
            best_path.append(best_index_point)
            best_last_point = best_index_point
        best_path.reverse()
        return best_path
    def get_batch_best_path(self, inputs, output_mask):
        if not self.use_cuda:
            inputs = inputs.cpu()
            output_mask = output_mask.cpu()
        batch_best_path = []
        max_len = inputs.size(1)
        for ix, features in enumerate(inputs):
            features = features[output_mask[ix] == 1]
            best_path = self.get_best_path(features)
            best_path = torch.Tensor(best_path).long()
            best_path = padding(best_path, max_len)
            batch_best_path.append(best_path)
        batch_best_path = torch.stack(batch_best_path, dim=0)
        return batch_best_path
def log_sum(matrix, dim):
    """
    Numerically stable log-sum-exp.
    The forward algorithm keeps accumulating scores, so the summed
    exponentials can eventually overflow the largest representable float
    and become inf, which makes the log inf as well. To avoid this, factor
    a constant clip out of the exponential sum so no term gets too large:
        SUM = log(exp(s1) + exp(s2) + ... + exp(s100))
            = log{exp(clip) * [exp(s1-clip) + exp(s2-clip) + ... + exp(s100-clip)]}
            = clip + log[exp(s1-clip) + exp(s2-clip) + ... + exp(s100-clip)]
    where clip = max(matrix)
    """
    clip_value = torch.max(matrix)  # the maximum, used as the clipping constant
    clip_value = int(clip_value.data.tolist())
    log_sum_value = clip_value + \
        torch.log(torch.sum(torch.exp(matrix - clip_value), dim=dim))
    return log_sum_value
def argmax(matrix, dim=0):
    """Index of the maximum along dim, e.g. for (0.5, 0.4, 0.3) it returns 0."""
    _, index = torch.max(matrix, dim=dim)
    return index
def padding(vec, max_len, pad_token=-1):
    # pad a decoded path out to max_len with -1, which predict() filters out
    new_vec = torch.zeros(max_len).long()
    new_vec[:vec.size(0)] = vec
    new_vec[vec.size(0):] = pad_token
    return new_vec
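A quick smoke test of the layer in isolation (shapes follow the definitions above; the emission scores are random):

import torch
crf = CRF(num_tag=9)                    # 9 tags, matching the labels list above
inputs = torch.randn(2, 5, 9)           # fake emissions: (batch, time_steps, num_tag)
output_mask = torch.ones(2, 5).long()   # every position is a real character here
tags = torch.randint(0, 9, (2, 5))
loss = crf.negative_log_loss(inputs, output_mask, tags)
paths = crf.get_batch_best_path(inputs, output_mask)  # (2, 5) decoded tag indices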
Two tricks worth learning here: the log_sum treatment (log-sum-exp with a clipping constant), and the Viterbi decoding (essentially dynamic programming) used to compute the best path.
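A quick numerical check of why that clipping matters:

import torch
x = torch.tensor([1000.0, 999.0])
print(torch.log(torch.sum(torch.exp(x))))                # inf: exp(1000) overflows
clip = torch.max(x)
print(clip + torch.log(torch.sum(torch.exp(x - clip))))  # tensor(1000.3133)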
Finally, the training loop (nothing special here):
for e in range(num_epoch):
    model.train()
    for step, batch in enumerate(training_iter):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, output_mask = batch
        bert_encode = model(input_ids, segment_ids, input_mask).cpu()
        train_loss = model.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
        if args.gradient_accumulation_steps > 1:
            train_loss = train_loss / args.gradient_accumulation_steps
        if args.fp16:
            optimizer.backward(train_loss)
        else:
            train_loss.backward()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # modify the learning rate with the special warm-up BERT uses
            lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        predicts = model.predict(bert_encode, output_mask)
        label_ids = label_ids.view(1, -1)
        label_ids = label_ids[label_ids != -1]  # drop the -1 padding
        label_ids = label_ids.cpu()
        train_acc, f1 = model.acc_f1(predicts, label_ids)
        pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)

    # ----------------------- evaluation ----------------------------
    model.eval()
    count = 0
    y_predicts, y_labels = [], []
    eval_loss, eval_acc, eval_f1 = 0, 0, 0
    with torch.no_grad():
        for step, batch in enumerate(eval_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            eval_los = model.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
            eval_loss = eval_los + eval_loss
            count += 1
            predicts = model.predict(bert_encode, output_mask)
            y_predicts.append(predicts)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            y_labels.append(label_ids)

        eval_predicted = torch.cat(y_predicts, dim=0).cpu()
        eval_labeled = torch.cat(y_labels, dim=0).cpu()
        eval_acc, eval_f1 = model.acc_f1(eval_predicted, eval_labeled)
        model.class_report(eval_predicted, eval_labeled)
        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
            % (e + 1,
               train_loss.item(),
               eval_loss.item() / count,
               train_acc,
               eval_acc,
               eval_f1))

        # save the best model
        if eval_f1 > best_f1:
            best_f1 = eval_f1
            save_model(model, args.output_dir)

        if e % verbose == 0:
            train_losses.append(train_loss.item())
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss.item() / count)
            eval_accuracy.append(eval_acc)
At some point I should write a follow-up post introducing HMMs and CRFs properly, to sort out the theory.