import os
from functools import partial
import paddle
from paddlenlp.transformers import AutoModelForTokenClassification, AutoTokenizer
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
from information_extraction import(
data,utils
)
paddle.set_device('gpu')
train_ds, dev_ds, test_ds = data.load_dataset(datafiles=(
'./datasets/msra_ner/train.tsv',
'./datasets/msra_ner/dev.tsv','./datasets/msra_ner/test.tsv'))
for i in range(3):
text,label=test_ds[i]
print(''.join(text),'|',''.join(label))
label_vocab =data.load_dict_json('./datasets/msra_ner/label_map.json')
label_vocab
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
max_seq_len=128
BATCH_SIZE=32
trans_func = partial(utils.convert_to_features_maxlen, tokenizer=tokenizer,
label_vocab=label_vocab,max_seq_len=max_seq_len)
ignore_label =-1#计算损失时要忽略的(不是crf模型的设置)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id,dtype="int32"), # input_ids,填充0
Pad(axis=0, pad_val=tokenizer.pad_token_type_id,dtype="int32"), # token_type_ids, 填充0
Stack(dtype="int64"), # seq_len
# ignore_label,表示在训练过程中应该忽略这个标签索引的损失。
Pad(axis=0,pad_val=ignore_label,dtype="int64")
): [i for i in fn(samples)]
train_loader = utils.create_dataloader(
dataset=train_ds,
batch_size=BATCH_SIZE,
trans_fn=trans_func,
batchify_fn=batchify_fn)
model = AutoModelForTokenClassification.from_pretrained(\
"ernie-3.0-medium-zh", num_labels=len(label_vocab))
dev_loader = utils.create_dataloader(
dataset=dev_ds,mode='dev',
batch_size=BATCH_SIZE,
batchify_fn=batchify_fn,trans_fn=trans_func)
test_loader = utils.create_dataloader(
dataset=test_ds,mode='test',
batch_size=BATCH_SIZE,
batchify_fn=batchify_fn,trans_fn=trans_func)
for i1,i2,i3,i4 in test_loader:
print(i1.shape,i2.shape,i3.shape,i4.shape)
display(i1.numpy(),i2.numpy(),i3.numpy(),i4.numpy())
break
#度量模型性能的指标,suffix=True,意味着要考虑后缀,因为后缀也有语义信息
metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)# suffix:后缀
loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label,reduction='mean')
input_ = paddle.to_tensor([[0.,0.,0.,0.,0.1,0.1,0.1],[0.,0.,0.,0.,0.1,0.1,0.1]])
label = paddle.to_tensor([[1],[-1]])
dy_ret =loss_fn(input_, label)
print(dy_ret)
steps_per_epoch=len(train_loader)#每个轮次的步数,就是批次数
num_epochs=10
total_steps=steps_per_epoch*num_epochs#总步数
scheduler=utils.get_scheduler(total_steps)
# 定义优化器
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
parameters=model.parameters())
def train_epochs(epochs):
global_step=0
best_f1_score=0.
model_save_path='./checkpoints/msra/'
if not os.path.exists(model_save_path):
os.makedirs(model_save_path)
for epoch in range(epochs):
avg_loss,global_step = utils.train(\
model,train_loader,global_step,optimizer,loss_fn,scheduler)
print("epoch:%d - global_step:%d - loss: %.4f -best_score:%.5f -lr:%.5f" \
% (epoch, global_step, avg_loss,best_f1_score,optimizer.get_lr()))
precision, recall, f1_score=utils.evaluate(model,metric,dev_loader)
print("[EVAL] Precision: %.4f - Recall: %.4f - F1: %.4f" % (precision, recall, f1_score))
if f1_score>best_f1_score:
paddle.save(model.state_dict(),\
model_save_path+'best_1.pdparams')
best_f1_score=f1_score
train_epochs(num_epochs)#普通训练
model.set_state_dict(paddle.load('./checkpoints/msra/best_1.pdparams'))
utils.evaluate(model,metric,test_loader)
这个比飞浆nlp 示例的分数高多了
def parse_decodes(sentences, predictions, lengths, label_vocab,is_startswithB=False):
predictions = [x for batch in predictions for x in batch]#变成预测序列列表
lengths = [x for batch in lengths for x in batch]#变成单个sentence长度的列表
id_label = dict(zip(label_vocab.values(), label_vocab.keys()))#idx-->label字典
outputs = []
for idx, end in enumerate(lengths):
sent = sentences[idx][:end]
tags = [id_label[x] for x in predictions[idx][:end]]#预测的单个句子的标签
# print(''.join(tags))
sent_out = []
tags_out = []
words = ""
for s, t in zip(sent, tags):
if not is_startswithB:
if t.endswith("-B") or t == "O":#表示新实体开始或者非实体
if len(words):
sent_out.append(words)
tags_out.append(t.split("-")[0])#添加实体标记
words = s#实体开始字符
else:
words += s
else:
if t.startswith("B-") or t == "O":
if len(words):
sent_out.append(words)
if t.startswith("B-"):
tags_out.append(t.split("-")[1])
else:
tags_out.append(t)
words = s
else:
words += s
if len(sent_out) < len(tags_out):#添加最后的字符
sent_out.append(words)
outputs.append("".join([str((s, t)) for s, t in zip(sent_out, tags_out)]))
return outputs
id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
def predict(model, data_loader, ds, label_vocab):#模型,数据生成器
model.eval()
with paddle.no_grad():
all_preds = []
all_lens = []
for input_ids, seg_ids, lens, labels in data_loader:
#获取模型预测(36,49,13)#36是批次,49是一个样本有多少token,13是一个token属于13个类别中的分值
logits = model(input_ids, seg_ids)
#获取token预测#(36,49),argmax是获取最后一维最大索引,就是对应token的预测
preds = paddle.argmax(logits, axis=-1)
# 不要[CLS]预测,切片
preds = [pred[1:] for pred in preds.numpy()]#这时样本这行会剩48个token
all_preds.append(preds)#把一个批次的预测加进去
all_lens.append(lens)
sentences = [example[0] for example in ds.data]#获取样本句子
results = parse_decodes(
sentences, all_preds, all_lens, label_vocab,is_startswithB=True)
return results
def ids2labels(ids):
return ''.join([id_label[k] for k in ids])
[ids2labels(i[3]) for i in test_ds][:50]
preds = predict(model,test_loader,test_ds,label_vocab)
save_dir='./demo_txts/'
if not os.path.exists(save_dir): os.makedirs(save_dir)
file_path = save_dir+"msra_ner_不带crf版.txt"
with open(file_path, "w", encoding="utf8") as fout:
fout.write("\n".join(preds))
# Print some examples
print("结果已经保存在: %s, 下面是一些示例: " % file_path)
print("\n".join(preds[:50]))