BERT in Practice 2 --- NER

Dataset: 2014人民日报.zip (the 2014 People's Daily corpus)
Source code: https://github.com/circlePi/Bert_Chinese_Ner_pytorch

# B_PER marks the beginning of a person name, B_T the beginning of a time expression,
# B_ORG the beginning of an organization, and B_LOC the beginning of a location
# I_* marks a non-initial token inside an entity; O marks tokens outside any entity
labels = ["B_PER", "I_PER", "B_T", "I_T", "B_ORG", "I_ORG", "B_LOC", "I_LOC", "O"]

BERT produces a len(labels)-dimensional output for every token, and a CRF layer on top of it computes the loss.
Principle: BERT + CRF
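
As a quick sketch of the objective in LaTeX notation (my own summary, not taken from the repo): the per-token BERT outputs serve as emission scores E, the CRF learns a transition matrix T, and training minimizes the negative log-likelihood of the gold tag sequence.

% score of a tag sequence y for sentence x: emission terms plus transition terms
\mathrm{score}(x, y) = \sum_{t} E_{t, y_t} + \sum_{t} T_{y_{t-1}, y_t}
% negative log-likelihood: gold-path score against the log-sum-exp over all paths
\mathcal{L} = -\mathrm{score}(x, y^{*}) + \log \sum_{y'} \exp\bigl(\mathrm{score}(x, y')\bigr)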

  1. Data preprocessing
    The raw corpus is already split into single characters and tagged, so no further segmentation is needed; we only have to assemble the characters and tags into training examples (a sketch of this assembly follows below).
    data:
    [figure: sample of the character sequences]
    label:
    [figure: sample of the corresponding tag sequences]
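
A minimal sketch of how such parallel character/tag files could be assembled into sentence-level examples. The file names and the layout (one sentence per line, space-separated characters in one file and space-separated tags in the other) are assumptions on my part, not taken from the repo:

def read_corpus(data_path="source.txt", label_path="target.txt"):
    """Read parallel character/tag files into (chars, tags) pairs."""
    examples = []
    with open(data_path, encoding="utf-8") as fd, \
         open(label_path, encoding="utf-8") as fl:
        for chars_line, tags_line in zip(fd, fl):
            chars = chars_line.strip().split()
            tags = tags_line.strip().split()
            # the corpus is already tokenized per character, so lengths must match
            assert len(chars) == len(tags)
            examples.append((chars, tags))
    return examples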

  2. Building the training features:
    The constructed format looks like this:
    [figure: example of the constructed input features]
    Note that when building output_mask, the positions of [CLS] and [SEP] are set to 0 so that the CRF only scores real tokens (a sketch of this conversion follows below).
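
A minimal sketch of converting one (chars, tags) pair into model inputs, assuming a pytorch_pretrained_bert tokenizer and a label_map from tag strings to ids; the helper name and the padding details are my assumptions, not the repo's code. Padding label_ids with -1 matches the label_ids[label_ids != -1] filtering used in the training loop below.

def convert_example(chars, tags, tokenizer, label_map, max_len=128):
    tokens = ["[CLS]"] + chars[:max_len - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)
    # labels only for real characters; padded with -1 so they can be filtered later
    label_ids = [label_map[t] for t in tags[:max_len - 2]]
    # output_mask: 0 at [CLS]/[SEP] (and padding) so the CRF ignores those positions
    output_mask = [0] + [1] * (len(tokens) - 2) + [0]

    pad = max_len - len(input_ids)
    input_ids += [0] * pad
    input_mask += [0] * pad
    segment_ids += [0] * pad
    output_mask += [0] * pad
    label_ids += [-1] * (max_len - len(label_ids))
    return input_ids, input_mask, segment_ids, label_ids, output_mask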

  3. Model
    Model definition:

import torch.nn as nn
from net.crf import CRF
import numpy as np
from sklearn.metrics import f1_score, classification_report
import config.args as args
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel


class Bert_CRF(BertPreTrainedModel):
    def __init__(self,
                 config,
                 num_tag):
        super(Bert_CRF, self).__init__(config)
        # BERT encoder
        self.bert = BertModel(config)
        # for p in self.bert.parameters():
        #     p.requires_grad = False
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # project the hidden states to the tag space
        self.classifier = nn.Linear(config.hidden_size, num_tag)
        self.apply(self.init_bert_weights)

        self.crf = CRF(num_tag)  # CRF layer, used to compute the loss

    def forward(self,
                input_ids,
                token_type_ids,
                attention_mask,
                label_id=None,
                output_all_encoded_layers=False):
        bert_encode, _ = self.bert(input_ids, token_type_ids, attention_mask,
        	output_all_encoded_layers=output_all_encoded_layers)

        # bert_encode: (batch_size, seq_len, config.hidden_size)
        output = self.classifier(bert_encode)
        return output

    # compute the CRF loss
    def loss_fn(self, bert_encode, output_mask, tags):
        loss = self.crf.negative_log_loss(bert_encode, output_mask, tags)
        return loss

    def predict(self, bert_encode, output_mask):
        predicts = self.crf.get_batch_best_path(bert_encode, output_mask)
        predicts = predicts.view(1, -1).squeeze()
        predicts = predicts[predicts != -1]
        return predicts

    def acc_f1(self, y_pred, y_true):
        y_pred = y_pred.numpy()
        y_true = y_true.numpy()
        f1 = f1_score(y_true, y_pred, average="macro")
        correct = np.sum((y_true==y_pred).astype(int))
        acc = correct/y_pred.shape[0]
        return acc, f1

    def class_report(self, y_pred, y_true):
        y_true = y_true.numpy()
        y_pred = y_pred.numpy()
        classify_report = classification_report(y_true, y_pred)
        print('\n\nclassify_report:\n', classify_report)
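
For reference, a minimal sketch of instantiating the class with the pytorch_pretrained_bert API; from_pretrained forwards the extra num_tag keyword to the constructor. The checkpoint name "bert-base-chinese" is my assumption, not necessarily what the repo configures.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint name assumed here for illustration
model = Bert_CRF.from_pretrained("bert-base-chinese", num_tag=len(labels))
model.to(device)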

The key point is how the loss is computed: it is not a plain CrossEntropy. The BERT outputs are treated as the CRF's emission scores, and a transition matrix is trained on top of them. The CRF is defined as follows:

import torch
import torch.nn as nn
from torch.autograd import Variable


class CRF(nn.Module):
    """线性条件随机场"""
    def __init__(self, num_tag, use_cuda=False):
        if num_tag <= 0:
            raise ValueError("Invalid value of num_tag: %d" % num_tag)
        super(CRF, self).__init__()
        self.num_tag = num_tag
        self.start_tag = num_tag
        self.end_tag = num_tag + 1
        self.use_cuda = use_cuda
        # Transition matrix: transitions[j, k] is the score of moving from tag_j to tag_k
        # transitions[j, :] covers all edges leaving tag_j
        # transitions[:, k] covers all edges entering tag_k
        self.transitions = nn.Parameter(torch.Tensor(num_tag + 2, num_tag + 2))
        nn.init.uniform_(self.transitions, -0.1, 0.1)
        # transitions from the end tag to any other tag are impossible;
        # if one occurs it incurs a very large penalty
        self.transitions.data[self.end_tag, :] = -10000
        # likewise, transitions from any tag into the start tag are impossible
        self.transitions.data[:, self.start_tag] = -10000

    def real_path_score(self, features, tags):
        """
        features: (time_steps, num_tag)
        real_path_score is the score of the real (gold) path.
        It is the sum of two parts: the emission score and the transition score.
        The emission score comes from the encoder output at the true tags,
        i.e. it measures how well the outputs match the gold labels.
        The transition score comes from the CRF layer's trainable parameter;
        it is randomly initialized and stores the constraints (transition
        scores) between consecutive tags in the sequence.
        During training we want real_path_score to be the highest, since the
        gold path should be the most likely of all paths.
        """
        r = torch.LongTensor(range(features.size(0)))
        if self.use_cuda:
            pad_start_tags = torch.cat(
            	[torch.cuda.LongTensor([self.start_tag]), tags])
            pad_stop_tags = torch.cat(
            	[tags, torch.cuda.LongTensor([self.end_tag])])
            r = r.cuda()
        else:
            pad_start_tags = torch.cat([torch.LongTensor([self.start_tag]), tags])
            pad_stop_tags = torch.cat([tags, torch.LongTensor([self.end_tag])])

        # Transition score + Emission score
        score = torch.sum(self.transitions[pad_start_tags, pad_stop_tags]).cpu() + \
            torch.sum(features[r, tags])
        return score

    def all_possible_path_score(self, features):
        """
        Compute the log-sum of the scores of all possible paths (the forward algorithm):
        step 1: expand the forward column into a (num_tag, num_tag) matrix
        step 2: expand the next token's emission row into a (num_tag, num_tag) matrix
        step 3: add steps 1 and 2 to the corresponding transition scores
        step 4: update forward by reducing over the rows
        step 5: apply log_sum to forward to obtain the total score
        """
        time_steps = features.size(0)
        # initialize the forward scores (emission score of START_TAG) to zero
        forward = Variable(torch.zeros(self.num_tag))
        if self.use_cuda:
            forward = forward.cuda()
        
        # START_TAG -> 1st word -> 2nd word ->...->END_TAG
        for i in range(0, time_steps):  
            emission_start = forward.expand(self.num_tag, self.num_tag).t()
            emission_end = features[i,:].expand(self.num_tag, self.num_tag)
            if i == 0:
                trans_score = self.transitions[self.start_tag, \
                	:self.start_tag].cpu()
            else:
                trans_score = self.transitions[:self.start_tag, \
                	:self.start_tag].cpu()
            sum = emission_start + emission_end + trans_score
            forward = log_sum(sum, dim=0)
        forward = forward + \
            self.transitions[:self.start_tag, self.end_tag].cpu()  # END_TAG
        total_score = log_sum(forward, dim=0)
        return total_score

    def negative_log_loss(self, inputs, output_mask, tags):
        """
        inputs: (batch_size, time_step, num_tag)
        target_function = P_real_path_score / P_all_possible_path_score
                        = exp(S_real_path_score) / sum(exp(certain_path_score))
        We want the probability of the real path, i.e. the value of
        target_function, to be as large as possible, so the loss is taken as
        its negative logarithm (the smaller the better):
        loss_function = -log(target_function)
                      = -S_real_path_score + log(exp(S_1) + exp(S_2) + ...)
                      = -S_real_path_score + log(all_possible_path_score)
        """
        if not self.use_cuda:
            inputs = inputs.cpu()
            output_mask = output_mask.cpu()
            tags = tags.cpu()

        loss = Variable(torch.tensor(0.), requires_grad=True)
        num_tag = inputs.size(2)
        num_chars = torch.sum(output_mask.detach()).float()
        for ix, (features, tag) in enumerate(zip(inputs, tags)):
            # filter out [CLS], [SEP] and sub-word positions
            # features (time_steps, num_tag)
            # output_mask (batch_size, time_step)
            num_valid = torch.sum(output_mask[ix].detach())
            features = features[output_mask[ix]==1]
            tag = tag[:num_valid]
            real_score = self.real_path_score(features, tag)
            total_score = self.all_possible_path_score(features)
            cost = total_score - real_score
            loss  = loss + cost
        return loss/num_chars

    def viterbi(self, features):
        time_steps = features.size(0)
        forward = Variable(torch.zeros(self.num_tag))  # START_TAG
        if self.use_cuda:
            forward = forward.cuda()
        # back_points: best score reaching each tag; index_points: index of the previous tag
        back_points, index_points = [self.transitions[self.start_tag, \
        	:self.start_tag].cpu()], [torch.LongTensor([-1]).expand_as(forward)]
        
         # START_TAG -> 1st word -> 2nd word ->...->END_TAG
        for i in range(1, time_steps): 
            emission_start = forward.expand(self.num_tag, self.num_tag).t()
            emission_end = features[i,:].expand(self.num_tag, self.num_tag)
            trans_score = self.transitions[:self.start_tag, \
            	:self.start_tag].cpu()
            sum = emission_start + emission_end + trans_score
            forward, index = torch.max(sum.detach(), dim=0)
            back_points.append(forward)
            index_points.append(index)
        back_points.append(forward + \
        	self.transitions[:self.start_tag, self.end_tag].cpu())  # END_TAG
        return back_points, index_points

    def get_best_path(self, features):
        back_points, index_points = self.viterbi(features)
        # find the end of the thread (the best final tag)
        best_last_point = argmax(back_points[-1])
        index_points = torch.stack(index_points)   # stack into a matrix
        m = index_points.size(0)
        # initialize the path with the best final tag
        best_path = [best_last_point]
        # follow the back-pointers to recover the best path
        for i in range(m-1, 0, -1):
            best_index_point = index_points[i][best_last_point]
            best_path.append(best_index_point)
            best_last_point = best_index_point
        best_path.reverse()
        return best_path

    def get_batch_best_path(self, inputs, output_mask):
        if not self.use_cuda:
            inputs = inputs.cpu()
            output_mask = output_mask.cpu()
        batch_best_path = []
        max_len = inputs.size(1)
        num_tag = inputs.size(2)
        for ix, features in enumerate(inputs):
            features = features[output_mask[ix]==1]
            best_path = self.get_best_path(features)
            best_path = torch.Tensor(best_path).long()
            best_path = padding(best_path, max_len)
            batch_best_path.append(best_path)
        batch_best_path = torch.stack(batch_best_path, dim=0)
        return batch_best_path


def log_sum(matrix, dim):
    """
    The forward algorithm keeps accumulating previous results, which has a drawback:
    once the exponential sum grows large enough it exceeds the floating-point range
    and becomes inf, and taking the log then also gives inf.
    To avoid this we make a small change:
    1. factor a suitable value clip out of the exponential sum, so that no single
       term becomes too large to compute
    SUM = log(exp(s1)+exp(s2)+...+exp(s100))
        = log{exp(clip)*[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]}
        = clip + log[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]
    where clip = max
    """
    clip_value = torch.max(matrix)                 # maximum value
    clip_value = int(clip_value.data.tolist())
    log_sum_value = clip_value + \
    	torch.log(torch.sum(torch.exp(matrix-clip_value), dim=dim))
    return log_sum_value


def argmax(matrix, dim=0):
    """(0.5, 0.4, 0.3)"""
    _, index = torch.max(matrix, dim=dim)
    return index


def padding(vec, max_len, pad_token=-1):
    new_vec = torch.zeros(max_len).long()
    new_vec[:vec.size(0)] = vec
    new_vec[vec.size(0):] = pad_token
    return new_vec

A trick worth learning here is the log_sum handling (see the sketch below). Decoding the best path uses the Viterbi algorithm, which is essentially dynamic programming.
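
To see why the clipping in log_sum matters, here is a small self-contained check (my own toy example, not from the repo): the naive log(sum(exp(...))) overflows for large scores, while factoring out the maximum first stays finite and correct.

import torch

scores = torch.tensor([1000.0, 999.0, 998.0])

# naive version overflows: exp(1000) is inf in floating point
naive = torch.log(torch.sum(torch.exp(scores)))                    # -> inf

# clipped version: subtract the maximum before exponentiating
clip = torch.max(scores)
stable = clip + torch.log(torch.sum(torch.exp(scores - clip)))     # -> ~1000.41
print(naive.item(), stable.item())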

Finally, the training loop (nothing special here):

for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            # print("input_id", input_ids)
            # print("input_mask", input_mask)
            # print("segment_id", segment_ids)
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            train_loss = model.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)

            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = model.predict(bert_encode, output_mask)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()

            train_acc, f1 = model.acc_f1(predicts, label_ids)
            pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)

# ----------------------- Evaluation ----------------------------
        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = model(input_ids, segment_ids, input_mask).cpu()
                eval_los = model.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts =  model.predict(bert_encode, output_mask)
                y_predicts.append(predicts)

                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)


            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()

            eval_acc, eval_f1 = model.acc_f1(eval_predicted, eval_labeled)
            model.class_report(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                % (e + 1,
                   train_loss.item(),
                   eval_loss.item()/count,
                   train_acc,
                   eval_acc,
                   eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if e % verbose == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item()/count)
                eval_accuracy.append(eval_acc)
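
After training, the flat id predictions returned by model.predict can be mapped back to tag strings and entity spans. A minimal sketch assuming the labels list from above; the span-extraction helper is my own illustration, not part of the repo:

id2label = {i: tag for i, tag in enumerate(labels)}

def extract_entities(chars, pred_ids):
    """Group BIO-style tags back into (entity_type, text) spans."""
    tags = [id2label[int(i)] for i in pred_ids]
    entities, current, ent_type = [], [], None
    for ch, tag in zip(chars, tags):
        if tag.startswith("B_"):
            if current:
                entities.append((ent_type, "".join(current)))
            current, ent_type = [ch], tag[2:]
        elif tag.startswith("I_") and current and tag[2:] == ent_type:
            current.append(ch)
        else:
            if current:
                entities.append((ent_type, "".join(current)))
            current, ent_type = [], None
    if current:
        entities.append((ent_type, "".join(current)))
    return entities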

At some point I should probably write a separate post introducing HMMs and CRFs to sort out the theory.
