Pattern-Exploiting Training: Using the MLM Task for Text Matching [Annotated Code]


1. Summary
• Original articles:

# PET, another clever take on text classification: https://xv44586.github.io/2020/10/25/pet/
# CCF Q&A matching competition (part 2): how to win with just "BERT": https://xv44586.github.io/2021/01/20/ccf-qa-2/

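PET's core trick, as the linked posts explain, is to wrap each (query, reply) pair in a natural-language pattern that contains a [MASK] slot and let the pretrained MLM head fill that slot with a label word ('低' for mismatch, '高' for match). Below is a minimal sketch of that reformulation; the query/reply strings are placeholders for illustration only, and the real pipeline in section 3 builds this with the tokenizer.

# Sketch of the PET reformulation (illustrative only, not part of the original repo)
pattern = '下面两个句子的语义相似度较高:'   # "The semantic similarity of the two sentences below is relatively high:"
query   = '...'                              # a user query from the dataset (placeholder)
reply   = '...'                              # a candidate reply (placeholder)
# Training input : [CLS] 下面两个句子的语义相似度较[MASK]: + query [SEP] reply [SEP]
# Training target: the label word ('高' or '低') at the [MASK] position
# Inference      : compare the MLM probabilities of '高' vs '低' at the [MASK] position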

3. Annotated Code

Original repository: https://github.com/xv44586/ccf_2020_qa_match

# -*- coding: utf-8 -*-
# @Date    : 2020/11/4
# @Author  : mingming.xu
# @Email   : xv44586@gmail.com
# @File    : ccf_2020_qa_match_pet.py
"""
Pattern-Exploiting Training (PET): add a pattern and turn the matching task into an MLM task.
Online f1: 0.761

tips:
  when switching pretrained models, update the config_path/checkpoint_path/dict_path paths and the arguments of build_transformer_model accordingly
"""

import os
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

from toolkit4nlp.backend import keras, K
from toolkit4nlp.tokenizers import Tokenizer, load_vocab
from toolkit4nlp.models import build_transformer_model, Model
from toolkit4nlp.optimizers import *
from toolkit4nlp.utils import pad_sequences, DataGenerator
from toolkit4nlp.layers import *

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# PET, another clever take on text classification: https://xv44586.github.io/2020/10/25/pet/
# CCF Q&A matching competition (part 2): how to win with just "BERT": https://xv44586.github.io/2021/01/20/ccf-qa-2/
num_classes = 32
maxlen = 128
batch_size = 8

# Pretrained model paths (NEZHA-Base checkpoint here; any BERT-style model works)

config_path = 'data/pretrained/nezha/NEZHA-Base/bert_config.json'
checkpoint_path = 'data/pretrained/nezha/NEZHA-Base/model.ckpt-900000'
dict_path = 'data/pretrained/nezha/NEZHA-Base/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)

# pattern: "The semantic similarity of the two sentences below is relatively high:"
pattern = '下面两个句子的语义相似度较高:'
# tokenizer.encode prepends [CLS], so the index of the label word is shifted by +1
tokens = ["CLS"] + list(pattern)
print(tokens[14])  # '高', the character that will be replaced by [MASK]
mask_idx = [14]    # position of the label word inside the encoded pattern

id2label = {
    0: '低',   # 'low'  -> not a match
    1: '高'    # 'high' -> a match
}

label2id = {v: k for k, v in id2label.items()}
print('label2id:', label2id)  # label2id: {'低': 0, '高': 1}
labels = list(id2label.values())
print('labels:', labels)  # labels: ['低', '高']
# token ids of the label words; encode() wraps the text in [CLS]/[SEP], so take tokens[1:-1] to strip them
label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])
print('label_ids:', label_ids)  # label_ids: [[ 856] [7770]]
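
# Worked example (illustrative): for a labeled pair (q, r, label=1), the generator below builds
#   source: [CLS] 下 面 两 个 句 子 的 语 义 相 似 度 较 [MASK] :  q ... [SEP] r ... [SEP]
#   target: the same token ids, except position 14 holds the id of '高' (7770, per the print above)
# so the MLM head reconstructs the visible tokens and predicts the label word at the masked slot.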

# Note: random_masking is defined here but, as the post points out, it is not actually used below
def random_masking(token_ids):
    """Apply BERT-style random masking to the input
    """
    # one random number per token
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        # 15% of tokens are selected; of those, 80% become [MASK], 10% stay unchanged,
        # 10% become a random token; selected tokens get target=t, all others get target=0
        if r < 0.15 * 0.8:
            # replace with [MASK]; predict the original token
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            # keep the token; still predict it
            source.append(t)
            target.append(t)
        elif r < 0.15:
            # replace with a random token; still predict the original
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            # keep the token; target 0 means this position is ignored by the loss
            source.append(t)
            target.append(0)
    return source, target


class data_generator(DataGenerator):
    def __init__(self, prefix=False, *args, **kwargs):
        super(data_generator, self).__init__(*args, **kwargs)
        self.prefix = prefix

    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
        # iterate over (query, reply, label) triples
        for is_end, (q, r, label) in self.get_sample(shuffle):
            # unlabeled samples carry label=None
            label = int(label) if label is not None else None
            # prepend the pattern when the sample is labeled, or when prefix mode is on (test time)
            if label is not None or self.prefix:
                q = pattern + q
            # encode the (query, reply) pair into token_ids / segment_ids
            token_ids, segment_ids = tokenizer.encode(q, r, maxlen=maxlen)
            if shuffle:
                # BERT-style random masking; not actually used in this post
                source_tokens, target_tokens = random_masking(token_ids)
            else:
                # otherwise source and target start out identical
                source_tokens, target_tokens = token_ids[:], token_ids[:]
            # mask the label position
            if label is not None:
                # convert the label into its token id(s); since this is an MLM task,
                # the target is simply the token of the label word
                # (this local label_ids shadows the global one; it holds this sample's label token)
                label_ids = tokenizer.encode(id2label[label])[0][1:-1]
                # mask_idx marks where the label word sits inside the pattern;
                # here there is exactly one label token, so the loop runs once
                for m, lb in zip(mask_idx, label_ids):
                    # replace the label word with [MASK] in the input and put the label
                    # token id at the same position in the target; everything else stays unchanged
                    source_tokens[m] = tokenizer._token_mask_id
                    target_tokens[m] = lb
            elif self.prefix:
                # unlabeled (test) samples: just mask the label slot(s)
                for i in mask_idx:
                    source_tokens[i] = tokenizer._token_mask_id
            # collect the MLM inputs: source_tokens, segment_ids, target_tokens
            batch_token_ids.append(source_tokens)
            batch_segment_ids.append(segment_ids)
            batch_target_ids.append(target_tokens)

            if is_end or len(batch_token_ids) == self.batch_size:
                # a full batch (or the final partial one): pad it and yield it
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_target_ids = pad_sequences(batch_target_ids)
                # batch_target_ids holds the target token id for every position (padding is 0 and ignored)
                yield [batch_token_ids, batch_segment_ids, batch_target_ids], None
                # reset the batch buffers
                batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []


class CrossEntropy(Loss):
    """Cross entropy as the loss, with padded positions (y_true == 0) masked out
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        # positions where y_true != 0 (i.e. non-padding) contribute to the loss
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        # token-level accuracy
        accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        # restrict it to non-padding positions
        accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
        # report it as a metric
        self.add_metric(accuracy, name='accuracy')
        # cross entropy over the vocabulary
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        # average over non-padding positions only
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss
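
# Note (based on the generator above): without random masking the target sequence equals the input
# ids except at the masked label slot, so y_mask only removes padding; the loss therefore covers
# every real token, but only the [MASK] position requires a non-trivial prediction (the label word).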


def train(train_data, val_data, test_data, best_model_file, test_result_file):
    # the unlabeled test data is mixed into the training data as well (its label slot is never masked)
    train_generator = data_generator(data=train_data + test_data, batch_size=batch_size)
    valid_generator = data_generator(data=val_data, batch_size=batch_size)
    test_generator = data_generator(data=test_data, batch_size=batch_size, prefix=True)
    target_in = Input(shape=(None,))
    model = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    with_mlm=True,  # with_mlm=True makes the model output MLM probabilities over the vocabulary
                                    # model='bert',  # for BERT/RoBERTa/ERNIE checkpoints
                                    model='nezha'
                                    )
    output = CrossEntropy(output_idx=1)([target_in, model.output])
    # the training model takes the extra target input; its output is still the MLM output
    train_model = Model(model.inputs + [target_in], output)
    # weight decay + gradient accumulation
    AdamW = extend_with_weight_decay(Adam)
    AdamWG = extend_with_gradient_accumulation(AdamW)
    opt = AdamWG(learning_rate=1e-5, exclude_from_weight_decay=['Norm', 'bias'], grad_accum_steps=4)
    train_model.compile(opt)
    train_model.summary()

    def evaluate(data):
        # P = gold positives, R = predicted positives, TP = true positives
        P, R, TP = 0., 0., 0.
        for d, _ in tqdm(data):
            x_true, y_true = d[:2], d[2]
            # predicted label indices (0 = '低', 1 = '高')
            y_pred = predict(x_true)
            # recover the true label index from the token at the mask position
            y_true = np.array([labels.index(tokenizer.decode(y)) for y in y_true[:, mask_idx]])
            # accumulate counts for f1
            R += y_pred.sum()
            P += y_true.sum()
            TP += ((y_pred + y_true) > 1).sum()
        print(P, R, TP)
        # precision = TP / predicted positives, recall = TP / gold positives
        pre = TP / R
        rec = TP / P
        return 2 * (pre * rec) / (pre + rec)
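
    # Quick arithmetic check of the f1 above (made-up counts): with 100 gold positives,
    # 90 predicted positives and 80 true positives, precision = 80/90 ≈ 0.889, recall = 80/100 = 0.8,
    # and f1 = 2 * 0.889 * 0.8 / (0.889 + 0.8) ≈ 0.842.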

    def predict(x):
        if len(x) == 3:
            x = x[:2]
        # `model` (not train_model) is used here: it shares weights with train_model and
        # outputs MLM probabilities of shape (batch, seq_len, vocab_size);
        # keep only the mask position -> (batch, 1, vocab_size)
        y_pred = model.predict(x)[:, mask_idx]
        # take the probabilities of the label tokens ('低', '高') -> (batch, num_labels)
        y_pred = y_pred[:, 0, label_ids[:, 0]]
        # the most probable label word is the predicted class
        y_pred = y_pred.argmax(axis=1)
        return y_pred

    class Evaluator(keras.callbacks.Callback):
        def __init__(self, valid_generator, best_pet_model_file="best_pet_model.weights"):
            super(Evaluator, self).__init__()
            self.best_acc = 0.
            self.valid_generator = valid_generator
            self.best_pet_model_file = best_pet_model_file

        def on_epoch_end(self, epoch, logs=None):
            # evaluate() returns f1 on the validation set; keep the best checkpoint
            acc = evaluate(self.valid_generator)
            if acc > self.best_acc:
                self.best_acc = acc
                self.model.save_weights(self.best_pet_model_file)
            print('val f1: {}, best f1: {}'.format(acc, self.best_acc))

    def write_to_file(path, test_generator, test_data):
        preds = []
        # predict the test set batch by batch
        for x, _ in tqdm(test_generator):
            pred = predict(x)
            preds.extend(pred)

        # write the original query, reply, gold label (-1 if unknown) and prediction to file
        ret = []
        for data, p in zip(test_data, preds):
            if data[2] is None:
                label = -1
            else:
                label = data[2]
            ret.append([data[0], data[1], str(label), str(p)])

        with open(path, 'w') as f:
            for r in ret:
                f.write('\t'.join(r) + '\n')

    evaluator = Evaluator(valid_generator, best_model_file)
    train_model.fit_generator(train_generator.generator(),
                              steps_per_epoch=len(train_generator),
                              epochs=10,
                              callbacks=[evaluator])

    train_model.load_weights(best_model_file)
    write_to_file(test_result_file, test_generator, test_data)

def load_pair_data(f, isshuffle=False):
    data = []
    df = pd.read_csv(f)
    if isshuffle:
        df = df.sample(frac=1.0, random_state=1234)
    columns = list(df.columns)
    if 'text_a' not in columns and 'query1' in columns:
        df.rename(columns={'query1':'text_a', 'query2':'text_b'}, inplace=True)
    for i in range(len(df)):
        can = df.iloc[i]
        text_a = can['text_a']
        text_b = can['text_b']
        if 'label' not in columns:
            label = None
        else:
            label = int(can['label'])
            if label == -1:
                label = None
        data.append([text_a, text_b, label])
    return data

def load_data():
    """
    :return: [text_a, text_b, label]
    Tianchi epidemic-related sentence matching dataset
    """
    data_dir = '../data/tianchi/'
    train_file = data_dir + 'train_20200228.csv'
    dev_file = data_dir + 'dev_20200228.csv'
    test_file = data_dir + 'test.example_20200228.csv'
    train_data = load_pair_data(train_file)
    val_data = load_pair_data(dev_file)
    test_data = load_pair_data(test_file)
    return train_data, val_data, test_data


def test_data_generator():
    data_dir = '../data/tianchi/'
    train_file = data_dir + 'train_20200228.csv'
    data = load_pair_data(train_file)
    train_generator = data_generator(data=data, batch_size=batch_size)
    for d in train_generator:
        print(d)
        break

def run():
    train_data, val_data, test_data = load_data()
    best_model_file = 'best_pet_model.weights'
    test_result_file = 'pet_submission.tsv'
    train(train_data, val_data, test_data, best_model_file, test_result_file)


if __name__ == '__main__':
    test_data_generator()
    run()



