import ast
import os
import json
import warnings
import random
from functools import partial
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.transformers import AutoModelForTokenClassification, AutoTokenizer
from paddlenlp.metrics import ChunkEvaluator
def read_by_lines(path):
    result = list()
    with open(path, "r", encoding="utf8") as infile:
        for line in infile:
            result.append(line.strip())
    return result
def write_by_lines(path, data):
    with open(path, "w", encoding="utf8") as outfile:
        for d in data:
            outfile.write(d + "\n")
def load_dict(dict_path):
    vocab = {}
    for line in open(dict_path, 'r', encoding='utf-8'):
        value, key = line.strip('\n').split('\t')
        vocab[key] = int(value)
    return vocab
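# load_dict assumes one "<id>\t<tag>" pair per line; e.g. a line "3\tB-EVENT"
# (illustrative tag, not taken from the actual DuEE dict) yields vocab["B-EVENT"] = 3.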
num_epoch=3
learning_rate=6e-5
tag_path='./conf/DuEE1.0/trigger_tag.dict'
train_data='./datasets/DuEE1.0/trigger/train.tsv'
dev_data='./datasets/DuEE1.0/trigger/dev.tsv'
test_data='./datasets/DuEE1.0/trigger/test.tsv'
predict_data=None
warmup_proportion=0.0
batch_size=10
checkpoints='./checkpoints/Duee1.0_extract/'
init_ckpt=None
predict_save_path=None
seed=1000
device='gpu'
weight_decay=0.0
max_seq_len=300  # maximum encoded sequence length; matches the value used for prediction below
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)
from paddlenlp.datasets import MapDataset
def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp:
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
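# The TSV files are expected to contain a header row followed by lines of the form
# "<chars>\t<tags>", with both fields '\002'-separated and of equal length,
# e.g. (schematic): "w1\002w2\002w3\tB-X\002I-X\002O".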
paddle.set_device(device)
set_seed(seed)
no_entity_label = 'O'
ignore_label = -1
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
label_map = load_dict(tag_path)
id2label = {val: key for key, val in label_map.items()}
model = AutoModelForTokenClassification.from_pretrained(
"ernie-3.0-medium-zh", num_classes=len(label_map))
def convert_example_to_feature(example,
                               tokenizer,
                               label_vocab=None,
                               max_seq_len=512,
                               no_entity_label='O',
                               is_test=False):
    tokens, labels = example
    if not is_test:
        assert len(tokens) == len(labels)
    # After adding [CLS] and [SEP] the total length stays within max_length;
    # over-long inputs are truncated from the end.
    tokenized_input = tokenizer(tokens,
                                return_length=True,
                                is_split_into_words='token',
                                max_length=max_seq_len,
                                truncation=True)
    input_ids = tokenized_input['input_ids']
    token_type_ids = tokenized_input['token_type_ids']
    seq_len = tokenized_input['seq_len']
    if is_test:
        return input_ids, token_type_ids, seq_len
    else:
        if len(input_ids) - 2 < len(labels):  # only happens when the input was truncated
            labels = labels[:(max_seq_len - 2)]  # truncate the labels to match
        # [CLS]/[SEP] were added around input_ids, so pad the label sequence
        # with the non-entity tag at both ends.
        encoded_label_ = [no_entity_label] + labels + [no_entity_label]
        encoded_label = [label_vocab[x] for x in encoded_label_]
        return input_ids, token_type_ids, seq_len, encoded_label
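# Schematic example of what convert_example_to_feature produces (tags are hypothetical):
#   tokens = ['马', '云', '辞', '职'], labels = ['O', 'O', 'B-X', 'I-X']
#   -> input_ids start with [CLS] and end with [SEP], seq_len = 6, and the encoded
#      labels are the ids of ['O', 'O', 'O', 'B-X', 'I-X', 'O'].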
train_ds, dev_ds, test_ds = load_dataset(datafiles=(train_data, dev_data, test_data))
# Quick sanity check on a single example with aggressive truncation.
a = convert_example_to_feature(train_ds[1], tokenizer, label_map, max_seq_len=10)
trans_func = partial(convert_example_to_feature,
                     tokenizer=tokenizer,
                     label_vocab=label_map,
                     no_entity_label=no_entity_label,
                     max_seq_len=max_seq_len,
                     is_test=False)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token type ids
    Stack(dtype='int64'),  # sequence lens
    Pad(axis=0, pad_val=ignore_label, dtype='int64')  # labels
): fn(samples)
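# The Tuple collator applies one transform per field of every sample in the batch:
# the first two Pad transforms right-pad input_ids / token_type_ids to the longest
# sequence in the batch (with the tokenizer's pad ids), Stack collects the scalar
# lengths into an array, and the final Pad fills the label positions of shorter
# sequences with ignore_label (-1) so the loss skips them. Schematic, assuming a
# pad id of 0:
#   [([1, 5, 2], [0, 0, 0], 3, [0, 3, 0]), ([1, 2], [0, 0], 2, [0, 0])]
#   -> input_ids [[1, 5, 2], [1, 2, 0]], seq_lens [3, 2], labels [[0, 3, 0], [0, 0, -1]]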
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)
    shuffle = True if mode == 'train' else False
    batch_sampler = paddle.io.BatchSampler(dataset,
                                           batch_size=batch_size,
                                           shuffle=shuffle)
    return paddle.io.DataLoader(dataset=dataset,
                                batch_sampler=batch_sampler,
                                collate_fn=batchify_fn,
                                return_list=True)
train_loader = create_dataloader(dataset=train_ds,
                                 batch_size=batch_size,
                                 batchify_fn=batchify_fn,
                                 trans_fn=trans_func)
dev_loader = create_dataloader(dataset=dev_ds,
                               mode='dev',
                               batch_size=batch_size,
                               batchify_fn=batchify_fn,
                               trans_fn=trans_func)
num_training_steps = len(train_loader) * num_epoch
warmup_steps=0
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                     warmup_steps)
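# With warmup_steps=0, LinearDecayWithWarmup reduces to a plain linear decay from
# learning_rate down to 0 over num_training_steps.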
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in decay_params)
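# apply_decay_param_fun limits weight decay to the names in decay_params, i.e.
# bias and normalization parameters are exempt (a common choice when fine-tuning
# transformers).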
metric = ChunkEvaluator(label_list=list(label_map.keys()), suffix=True)  # `suffix` controls whether the B/I position indicator is read as a suffix of the tag instead of a prefix
loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)  # positions labeled -1 (padding) are excluded from the loss
@paddle.no_grad()
def evaluate(model, loss_fn, metric, data_loader, mode="valid"):
    model.eval()
    metric.reset()
    total_loss, precision, recall, f1_score = 0, 0, 0, 0
    for batch in data_loader:
        input_ids, token_type_ids, seq_lens, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fn(logits, labels)
        preds = paddle.argmax(logits, axis=-1)  # predicted label indices
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            seq_lens, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        total_loss += loss.item()
    precision, recall, f1_score = metric.accumulate()
    avg_loss = total_loss / len(data_loader)
    print("%s: eval loss: %.5f, precision: %.5f, recall: %.5f, f1: %.5f" %
          (mode, avg_loss, precision, recall, f1_score))
    metric.reset()
    model.train()
    return f1_score
logging_steps=15
import time
checkpoints=checkpoints+'trigger'
os.makedirs(checkpoints,exist_ok=True)
eval_steps=300
global_step,best_f1=0,0.0
tic_train = time.time()
model.train()
for epoch in range(1, num_epoch + 1):
    for step, batch in enumerate(train_loader, start=1):
        input_ids, token_type_ids, seq_lens, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fn(logits, labels)
        if global_step % logging_steps == 0:
            print("global step %d, epoch: %d, batch: %d, loss: %.4f, speed: %.2f step/s, best_f1: %.4f, lr: %.5f"
                  % (global_step, epoch, step, loss.item(),
                     logging_steps / (time.time() - tic_train), best_f1, optimizer.get_lr()))
            tic_train = time.time()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
        global_step += 1
        if global_step % eval_steps == 0:
            f1_score = evaluate(model, loss_fn, metric, dev_loader)
            if f1_score > best_f1:
                best_f1 = f1_score
                paddle.save(model.state_dict(),
                            os.path.join(checkpoints, 'best_trigger.pdparams'))
label_map = load_dict(tag_path)
ignore_label = -1
init_ckpt = './checkpoints/Duee1.0_extract/trigger/best_trigger.pdparams'
if not init_ckpt or not os.path.isfile(init_ckpt):
    raise Exception("init checkpoints {} not exist".format(init_ckpt))
else:
    state_dict = paddle.load(init_ckpt)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % init_ckpt)
pred_data='./datasets/DuEE_1_0/test.json'
sentences = read_by_lines(pred_data)  # raw JSON lines of the original data
sentences = [json.loads(sent) for sent in sentences][:50]  # parse and keep the first 50 samples
max_seq_len=300
encoded_inputs_list = []
for sent in sentences:
    sent = sent["text"].replace(" ", "\002")
    input_ids, token_type_ids, seq_len = convert_example_to_feature(
        [list(sent), []],
        tokenizer,
        max_seq_len=max_seq_len,
        is_test=True)
    encoded_inputs_list.append((input_ids, token_type_ids, seq_len))
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
    Stack(dtype='int64')  # sequence lens
): fn(samples)
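# Unlike the training collator, this one has no label field (labels are unknown at
# prediction time) and stores ids as int32.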
batch_size=1
batch_encoded_inputs = [
    encoded_inputs_list[i:i + batch_size]
    for i in range(0, len(encoded_inputs_list), batch_size)
]
predict_save_path= './ckpt/DuEE1.0/trigger'
os.makedirs(predict_save_path, exist_ok=True)
# Sanity check: print the collated shapes of each batch.
for i in batch_encoded_inputs:
    input_ids, token_type_ids, seq_lens = batchify_fn(i)
    print(input_ids.shape, token_type_ids.shape, seq_lens.shape)
results = []
model.eval()
for batch in batch_encoded_inputs:
    input_ids, token_type_ids, seq_lens = batchify_fn(batch)
    input_ids = paddle.to_tensor(input_ids)
    token_type_ids = paddle.to_tensor(token_type_ids)
    logits = model(input_ids, token_type_ids)
    probs = F.softmax(logits, axis=-1)
    probs_ids = paddle.argmax(probs, -1).numpy()  # predicted label indices
    probs = probs.numpy()
    # p_list: per-token probability distribution of one sample, shape (padded_len, num_classes);
    # p_ids: predicted label index of each token; seq_len: length of the current sequence.
    for p_list, p_ids, seq_len in zip(probs, probs_ids, seq_lens):
        prob_one = [
            p_list[index + 1][pid]
            for index, pid in enumerate(p_ids[1:seq_len - 1])  # slice off the [CLS]/[SEP] positions
        ]
        label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]  # map indices back to tag strings
        results.append({"probs": prob_one, "labels": label_one})
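# Each entry in `results` pairs, for one sentence, the confidence of the predicted
# tag at every token ("probs") with the tag string itself ("labels"), with the
# [CLS]/[SEP] positions removed.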
assert len(results) == len(sentences)
for sent, ret in zip(sentences, results):
    sent["pred"] = ret
for s in sentences:
    probs = s['pred']['probs']
    s['pred']['probs'] = [float(i) for i in probs]  # numpy floats are not JSON serializable
sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]  # JSON-formatted strings
write_by_lines(predict_save_path + '/test_pred.json', sentences)
print("save data {} to {}".format(len(sentences), predict_save_path))
evaluate(model,loss_fn,metric,dev_loader)