msra_ner named entity recognition

import os
from functools import partial
import paddle
from paddlenlp.transformers import AutoModelForTokenClassification, AutoTokenizer
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
from information_extraction import data, utils

paddle.set_device('gpu')

train_ds, dev_ds, test_ds = data.load_dataset(datafiles=(
    './datasets/msra_ner/train.tsv',
    './datasets/msra_ner/dev.tsv',
    './datasets/msra_ner/test.tsv'))

# Preview a few test samples: characters and their tag sequences
for i in range(3):
    text, label = test_ds[i]
    print(''.join(text), '|', ''.join(label))

label_vocab = data.load_dict_json('./datasets/msra_ner/label_map.json')  # label -> id mapping

label_vocab
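label_vocab maps each label string to an integer id. The exact label set comes from label_map.json; purely as a hypothetical illustration, a BIO scheme over person/organization/location entities would look roughly like this:

# Hypothetical layout for illustration only -- the real ids/format come from label_map.json
label_vocab_example = {"B-PER": 0, "I-PER": 1, "B-ORG": 2, "I-ORG": 3,
                       "B-LOC": 4, "I-LOC": 5, "O": 6}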

 

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")

max_seq_len = 128

BATCH_SIZE = 32

trans_func = partial(utils.convert_to_features_maxlen, tokenizer=tokenizer,
                     label_vocab=label_vocab, max_seq_len=max_seq_len)
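utils.convert_to_features_maxlen is a project-local helper, so only its intent is shown here. A minimal sketch of what such a conversion function typically does, assuming the PaddleNLP 2.x tokenizer call with is_split_into_words (the real signature and label alignment may differ):

def convert_example_sketch(example, tokenizer, label_vocab, max_seq_len=128):
    # example = (list of characters, list of label strings); illustrative only
    tokens, labels = example
    encoded = tokenizer(tokens, is_split_into_words=True, max_seq_len=max_seq_len)
    # map labels to ids, truncate to fit between [CLS] and [SEP],
    # and mark the special-token positions with -1 so the loss ignores them
    label_ids = [label_vocab[l] for l in labels[:max_seq_len - 2]]
    label_ids = [-1] + label_ids + [-1]
    return (encoded["input_ids"], encoded["token_type_ids"],
            len(encoded["input_ids"]), label_ids)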

ignore_label = -1  # label index whose loss is ignored (a plain loss setting, not a CRF option)

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int32"),       # input_ids, padded with pad_token_id
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int32"),  # token_type_ids, padded with pad_token_type_id
    Stack(dtype="int64"),                                             # seq_len
    # labels, padded with ignore_label so padded positions contribute no loss
    Pad(axis=0, pad_val=ignore_label, dtype="int64")
): [i for i in fn(samples)]
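To see what this collate function produces, here is a toy batch pushed through the same Pad/Stack/Tuple combinators (toy ids, not real tokenizer output):

toy_samples = [
    ([1, 2, 3], [0, 0, 0], 3, [5, 6, 7]),
    ([1, 2],    [0, 0],    2, [5, 6]),
]
toy_fn = Tuple(
    Pad(axis=0, pad_val=0, dtype="int32"),
    Pad(axis=0, pad_val=0, dtype="int32"),
    Stack(dtype="int64"),
    Pad(axis=0, pad_val=-1, dtype="int64"),
)
ids, types, lens, labels = toy_fn(toy_samples)
print(ids)     # [[1 2 3] [1 2 0]]   -- padded to the longest sample
print(lens)    # [3 2]
print(labels)  # [[5 6 7] [5 6 -1]]  -- padding positions carry the ignore label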

train_loader = utils.create_dataloader(
    dataset=train_ds,
    batch_size=BATCH_SIZE,
    trans_fn=trans_func,
    batchify_fn=batchify_fn)

model = AutoModelForTokenClassification.from_pretrained(
    "ernie-3.0-medium-zh", num_labels=len(label_vocab))

dev_loader = utils.create_dataloader(
    dataset=dev_ds, mode='dev',
    batch_size=BATCH_SIZE,
    batchify_fn=batchify_fn, trans_fn=trans_func)

test_loader = utils.create_dataloader(
    dataset=test_ds, mode='test',
    batch_size=BATCH_SIZE,
    batchify_fn=batchify_fn, trans_fn=trans_func)

# Inspect one batch: input_ids, token_type_ids, seq_len, labels
for i1, i2, i3, i4 in test_loader:
    print(i1.shape, i2.shape, i3.shape, i4.shape)
    display(i1.numpy(), i2.numpy(), i3.numpy(), i4.numpy())  # display() is available in Jupyter
    break

# Chunk-level metric. suffix=True tells ChunkEvaluator that the B/I position marker
# sits at the end of the label (e.g. "PER-B") rather than at the start (e.g. "B-PER").
metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
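utils.evaluate is a project helper; a minimal sketch of an evaluation loop built on ChunkEvaluator's standard compute/update/accumulate cycle, assuming the loader yields the four fields built above:

@paddle.no_grad()
def evaluate_sketch(model, metric, data_loader):
    model.eval()
    metric.reset()
    for input_ids, token_type_ids, seq_lens, labels in data_loader:
        logits = model(input_ids, token_type_ids)
        preds = paddle.argmax(logits, axis=-1)
        n_infer, n_label, n_correct = metric.compute(seq_lens, preds, labels)
        metric.update(n_infer.numpy(), n_label.numpy(), n_correct.numpy())
    precision, recall, f1 = metric.accumulate()
    model.train()
    return precision, recall, f1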

loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label, reduction='mean')

# Quick sanity check: positions labelled with ignore_label are excluded from the loss
input_ = paddle.to_tensor([[0., 0., 0., 0., 0.1, 0.1, 0.1], [0., 0., 0., 0., 0.1, 0.1, 0.1]])
label = paddle.to_tensor([[1], [-1]])
dy_ret = loss_fn(input_, label)
print(dy_ret)
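Because the second row's label is -1 (the ignore_index), only the first row contributes to the mean; the printed loss is roughly 1.99, the cross entropy of the first row alone.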

steps_per_epoch = len(train_loader)  # steps per epoch = number of batches
num_epochs = 10
total_steps = steps_per_epoch * num_epochs  # total optimization steps

scheduler=utils.get_scheduler(total_steps)
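utils.get_scheduler is project-local. A common choice for ERNIE fine-tuning, and a plausible sketch of what it returns, is PaddleNLP's LinearDecayWithWarmup; the learning rate and warmup proportion below are assumptions:

from paddlenlp.transformers import LinearDecayWithWarmup

def get_scheduler_sketch(total_steps, learning_rate=2e-5, warmup_proportion=0.1):
    # warm up over the first 10% of steps, then decay linearly to zero
    return LinearDecayWithWarmup(learning_rate, total_steps, warmup_proportion)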

# Optimizer driven by the learning-rate scheduler
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                  parameters=model.parameters())

def train_epochs(epochs):
    global_step = 0
    best_f1_score = 0.
    model_save_path = './checkpoints/msra/'
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    for epoch in range(epochs):
        avg_loss, global_step = utils.train(
            model, train_loader, global_step, optimizer, loss_fn, scheduler)
        print("epoch:%d - global_step:%d - loss: %.4f - best_score:%.5f - lr:%.5f"
              % (epoch, global_step, avg_loss, best_f1_score, optimizer.get_lr()))
        precision, recall, f1_score = utils.evaluate(model, metric, dev_loader)
        print("[EVAL] Precision: %.4f - Recall: %.4f - F1: %.4f" % (precision, recall, f1_score))
        # Keep the checkpoint whenever the dev F1 improves
        if f1_score > best_f1_score:
            paddle.save(model.state_dict(),
                        model_save_path + 'best_1.pdparams')
            best_f1_score = f1_score

train_epochs(num_epochs)  # standard (non-CRF) training
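utils.train runs the per-epoch inner loop. A minimal sketch of such a loop, assuming it returns the average loss and the updated global step as used above:

def train_sketch(model, data_loader, global_step, optimizer, loss_fn, scheduler):
    model.train()
    total_loss = 0.
    for input_ids, token_type_ids, seq_lens, labels in data_loader:
        logits = model(input_ids, token_type_ids)            # (batch, seq_len, num_labels)
        loss = loss_fn(logits.reshape([-1, logits.shape[-1]]),
                       labels.reshape([-1, 1]))               # positions labelled -1 are ignored
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.clear_grad()
        total_loss += float(loss)
        global_step += 1
    return total_loss / len(data_loader), global_step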

 

# Restore the best checkpoint and evaluate on the test set
model.set_state_dict(paddle.load('./checkpoints/msra/best_1.pdparams'))

utils.evaluate(model, metric, test_loader)

 

This score is considerably higher than the official PaddleNLP example.

def parse_decodes(sentences, predictions, lengths, label_vocab, is_startswithB=False):
    predictions = [x for batch in predictions for x in batch]  # flatten to per-sentence prediction lists
    lengths = [x for batch in lengths for x in batch]          # flatten to per-sentence lengths
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))  # id -> label mapping
    outputs = []
    for idx, end in enumerate(lengths):
        sent = sentences[idx][:end]
        tags = [id_label[x] for x in predictions[idx][:end]]  # predicted labels for one sentence
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if not is_startswithB:
                # label scheme like "PER-B": a "-B" suffix (or "O") marks a new entity / non-entity
                if t.endswith("-B") or t == "O":
                    if len(words):
                        sent_out.append(words)
                    tags_out.append(t.split("-")[0])  # entity type
                    words = s  # first character of the new span
                else:
                    words += s
            else:
                # label scheme like "B-PER": a "B-" prefix (or "O") marks a new entity / non-entity
                if t.startswith("B-") or t == "O":
                    if len(words):
                        sent_out.append(words)
                    if t.startswith("B-"):
                        tags_out.append(t.split("-")[1])
                    else:
                        tags_out.append(t)
                    words = s
                else:
                    words += s
        if len(sent_out) < len(tags_out):  # flush the last pending span
            sent_out.append(words)
        outputs.append("".join([str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs
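A quick check of parse_decodes on a hand-made example (toy label set and ids, for illustration only):

toy_vocab = {"O": 0, "B-PER": 1, "I-PER": 2}
toy_sentences = [list("张三来了")]
toy_preds = [[[1, 2, 0, 0]]]  # one batch containing one sentence
toy_lens = [[4]]
print(parse_decodes(toy_sentences, toy_preds, toy_lens, toy_vocab, is_startswithB=True))
# ["('张三', 'PER')('来', 'O')('了', 'O')"]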

id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

def predict(model, data_loader, ds, label_vocab):
    model.eval()
    with paddle.no_grad():
        all_preds = []
        all_lens = []
        for input_ids, seg_ids, lens, labels in data_loader:
            # logits shape (batch, seq_len, num_labels), e.g. (36, 49, 13):
            # one score per label for every token in the batch
            logits = model(input_ids, seg_ids)
            # argmax over the last axis gives the predicted label id per token, shape (batch, seq_len)
            preds = paddle.argmax(logits, axis=-1)
            # drop the [CLS] prediction at position 0
            preds = [pred[1:] for pred in preds.numpy()]
            all_preds.append(preds)  # collect one batch of predictions
            all_lens.append(lens)
        sentences = [example[0] for example in ds.data]  # raw characters of each sample
        results = parse_decodes(
            sentences, all_preds, all_lens, label_vocab, is_startswithB=True)
        return results

def ids2labels(ids):
    return ''.join([id_label[k] for k in ids])

# Decode the stored label ids (field 3 of each converted sample) back to tag strings for the first 50 test samples
[ids2labels(i[3]) for i in test_ds][:50]

preds = predict(model, test_loader, test_ds, label_vocab)
save_dir = './demo_txts/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
file_path = save_dir + "msra_ner_不带crf版.txt"
with open(file_path, "w", encoding="utf8") as fout:
    fout.write("\n".join(preds))
# Print some examples
print("Results saved to: %s. Some examples:" % file_path)
print("\n".join(preds[:50]))

 
