def tokenize_and_align_labels(example, tokenizer, no_entity_id, label_vocab, max_seq_len=512):
    """Tokenize a pre-split example and align its per-token labels with the token ids.

    Args:
        example (dict): Must contain 'tokens' (a list of already-split token
            strings) and 'labels' (a list of label strings, one per token).
        tokenizer: Callable tokenizer (e.g. a PaddleNLP tokenizer) invoked with
            ``is_split_into_words=True``; must return a dict containing
            'input_ids' that includes the [CLS] and [SEP] special tokens.
        no_entity_id (int): Label id assigned to the [CLS]/[SEP] positions and
            to any trailing padding positions.
        label_vocab (dict): Maps a label string to its integer id.
        max_seq_len (int): Maximum sequence length forwarded to the tokenizer.

    Returns:
        dict: The tokenizer output with an added 'labels' list of ids whose
        length equals ``len(result['input_ids'])``.
    """
    labels = example['labels']
    tokens = example['tokens']
    tokenized_input = tokenizer(
        tokens,
        return_length=True,
        is_split_into_words=True,
        max_seq_len=max_seq_len)
    # -2 accounts for the [CLS] and [SEP] special tokens: if the tokenizer
    # truncated the sequence, truncate the label list to match.
    if len(tokenized_input['input_ids']) - 2 < len(labels):
        labels = labels[:len(tokenized_input['input_ids']) - 2]
    # Custom data read locally is not converted automatically, so map label
    # strings to ids by hand, bracketing with no_entity_id for [CLS]/[SEP].
    tokenized_input['labels'] = [no_entity_id] + [label_vocab[x] for x in labels] + [no_entity_id]
    # Pad the label list with no_entity_id so it matches input_ids in length.
    tokenized_input['labels'] += [no_entity_id] * (
        len(tokenized_input['input_ids']) - len(tokenized_input['labels']))
    return tokenized_input
一、自定义数据读取函数,并且做好读取配置,注意data_path这个参数必须对应好,名字和load_dataset保持一致 def read_out(data_path): """ pass """ with open(data_path, 'r', encoding='utf-8') as f: for line in f: line_stripped = line.strip().split('\t')