import os
from functools import partial

import paddle
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import AutoTokenizer, AutoModelForTokenClassification

from model import ErnieCrfForTokenClassification
from data import load_dict, load_dataset, parse_decodes
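
# Inference script for address NER with PaddleNLP: an ERNIE 3.0 encoder topped
# with a CRF decoding layer. It restores a fine-tuned checkpoint, runs batch
# prediction over data/mytest.txt, then decodes one hard-coded address string.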
@paddle.no_grad()
def predict(model, data_loader, ds, label_vocab):
    """Run CRF decoding over a dataloader and map tag ids back to text spans."""
    all_preds = []
    all_lens = []
    for input_ids, seg_ids, lens, labels in data_loader:
        preds = model(input_ids, seg_ids, lengths=lens)
        # Drop the [CLS] prediction at position 0.
        preds = [pred[1:] for pred in preds.numpy()]
        all_preds.append(preds)
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results
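
# predict() consumes batches shaped like convert_to_features' output:
# (input_ids, token_type_ids, seq_len, labels); batchify_fn below builds them.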
def convert_to_features(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
                                is_split_into_words=True)
    # Tokens '[CLS]' and '[SEP]' get the label 'O'.
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]
    return (tokenized_input['input_ids'], tokenized_input['token_type_ids'],
            tokenized_input['seq_len'], tokenized_input['labels'])
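
# A sketch of the expected example shape (the tag names here are hypothetical;
# the real inventory comes from data/tag.dic):
#   features = convert_to_features((['上', '海'], ['B-A1', 'I-A1']),
#                                  tokenizer, label_vocab)
# which yields (input_ids, token_type_ids, seq_len, label_ids), with the
# [CLS]/[SEP] positions mapped to the id of 'O'.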

# Create dataset, tokenizer and dataloader.
if __name__ == "__main__":
    paddle.set_device("cpu")

    # Restore the fine-tuned ERNIE + CRF weights from the checkpoint.
    dic = paddle.load("ernie_crf_ckpt/model_80")
    label_vocab = load_dict(os.path.join("./data", 'tag.dic'))
    ernie = AutoModelForTokenClassification.from_pretrained(
        "ernie-3.0-medium-zh", num_classes=len(label_vocab))
    model = ErnieCrfForTokenClassification(ernie)
    model.set_state_dict(dic)
    # Load the test set and map raw examples to model features.
    test_ds = load_dataset(datafiles=(os.path.join("./data", 'mytest.txt')))
    tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')
    trans_func = partial(convert_to_features,
                         tokenizer=tokenizer,
                         label_vocab=label_vocab)
    test_ds.map(trans_func)
    # Batch the fields in the same order convert_to_features returns them.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64')  # labels
    ): fn(samples)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=10,
                                       return_list=True,
                                       collate_fn=batchify_fn)
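
    # Batch inference over the test file: predict() drops each sequence's
    # [CLS] position and turns tag ids back into labeled spans.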
print("\n".join(preds[:10]))
    # Decode a single hard-coded address end to end.
    address = '上海市青浦区重固镇福贸路摩登丰巢2单元1343号18661203333王志平'
    # address = "安徽省枞阳县会宫乡安凤村石墙村"
    address = [list(address)]
    tokenized_input = tokenizer(address,
                                return_length=True,
                                is_split_into_words=True)
    print(tokenized_input)

    input_ids = paddle.to_tensor(tokenized_input['input_ids'], dtype='int32')
    token_type_ids = paddle.to_tensor(tokenized_input['token_type_ids'], dtype='int32')
    seq_len = paddle.to_tensor(tokenized_input['seq_len'], dtype='int64')
    print(f"seq_len: {seq_len}")

    preds = model(input_ids, token_type_ids, lengths=seq_len)
    print(preds)

    # Drop the [CLS] prediction, then decode tag ids into labeled spans.
    preds = [pred[1:] for pred in preds.numpy()]
    all_preds = [preds]
    all_lens = [seq_len]
    results = parse_decodes(address, all_preds, all_lens, label_vocab)
    print(results)
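
    # A minimal helper sketch (not part of the original script) that bundles
    # the single-address steps above; `decode_address` is a hypothetical name.
    def decode_address(text, model, tokenizer, label_vocab):
        tokenized = tokenizer([list(text)],
                              return_length=True,
                              is_split_into_words=True)
        input_ids = paddle.to_tensor(tokenized['input_ids'], dtype='int32')
        token_type_ids = paddle.to_tensor(tokenized['token_type_ids'], dtype='int32')
        seq_len = paddle.to_tensor(tokenized['seq_len'], dtype='int64')
        tag_ids = model(input_ids, token_type_ids, lengths=seq_len)
        tag_ids = [pred[1:] for pred in tag_ids.numpy()]  # drop [CLS]
        return parse_decodes([list(text)], [tag_ids], [seq_len], label_vocab)

    print(decode_address('安徽省枞阳县会宫乡安凤村石墙村', model, tokenizer, label_vocab))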
# Original code adapted from Baidu PaddlePaddle (飞桨 / PaddleNLP).