import os
import random
import time
import math
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator
from datasets import load_dataset
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
from paddlenlp.transformers import ErnieCtmForTokenClassification, ErnieCtmTokenizer
from paddlenlp.data import DataCollatorForTokenClassification
from paddlenlp.utils.log import logger
# Registry mapping a model-type key to its (model class, tokenizer class) pair.
# Keyed by the `model_type` config value; each entry pairs a token-classification
# head with its matching tokenizer.
MODEL_CLASSES = {
    "bert": (BertForTokenClassification, BertTokenizer),
    "ernie": (ErnieForTokenClassification, ErnieTokenizer),
    "ernie-ctm": (ErnieCtmForTokenClassification, ErnieCtmTokenizer),
}
# --- Script-level run configuration ---

# Model selection: which entry of MODEL_CLASSES to use and which weights to load.
model_type = 'ernie'
model_name_or_path = 'ernie-3.0-base-zh'

# Dataset name and checkpoint output location.
dataset = 'msra_ner'
output_dir = './checkpoints/msra_ner'

# Tokenization / batching.
max_seq_length = 128
batch_size = 10

# Optimizer hyperparameters.
learning_rate = 5e-5
weight_decay = 0.0
adam_epsilon = 1e-8
max_grad_norm = 1.0

# Training schedule.
num_train_epochs = 3
warmup_steps = 0

# Logging, reproducibility, and hardware.
logging_steps = 10
seed = 1000
device = 'gpu'
# Collect every pretrained weight name the Ernie tokenizer ships with.
# NOTE(review): the original wrote `list(MODEL_CLASSES.values())[1][-1]`,
# which picks the "ernie" entry only through dict insertion order; indexing
# by key is equivalent here and robust to reordering the registry.
# `a` stays a list-of-lists so `sum(a, [])` can flatten it (the inner
# elements must themselves be lists for `sum` with a [] start to work).
a = [list(MODEL_CLASSES["ernie"][-1].pretrained_init_configuration.keys())]
# Originally the joined string was a bare, discarded expression (notebook
# cell-output residue); bind it so the computed value is actually usable.
supported_models = ", ".join(sum(a, []))
# --- Environment, tokenizer, and dataset setup ---
paddle.set_device(device)  # place all subsequent ops on the configured device

# Load the raw NER dataset splits via HuggingFace `datasets`.
raw_datasets = load_dataset(dataset)

# Resolve the (model class, tokenizer class) pair for the configured type
# and build the tokenizer from the pretrained weights.
AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[model_type]
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

train_ds = raw_datasets['train']
# Label names of the `ner_tags` ClassLabel feature (e.g. B-PER/I-PER/.../O
# for msra_ner — TODO confirm exact tag set against the dataset card).
# NOTE(review): the original had a bare `train_ds.features['ner_tags']`
# expression on its own line — a discarded notebook-output statement; removed.
label_list = train_ds.features['ner_tags'].feature.names
label_num = len(label_list)
no_entity_id