文章目录
1、介绍
1.1、任务描述
任务描述——识别文本中具有特定意义的实体,包括人名,地名,机构名;
通常包括两部分:
- 实体边界识别
- 确定实体类型(人名,地名,机构名或其他)
例如:小明在北京上班
任务:
1、识别地名
2、识别人名
1.2、标注体系
- 数据标注体系
- IOB1,IOB2,IOE1,IOE2,IOBES,BILOU
- IOB2标注
- I表示实体内部,O表示实体外部,B表示实体开始
- B/I-XXX,XXX表示具体的类别
- IOBES标注
- I表示实体内部,O表示实体外部,B表示实体开始,E表示实体结束,S表示一个词单独形成一个命名实体
- 有时也会用M代替I,本质一样
| 标记 | 说明 |
| --- | --- |
| B-Person | 人名开始 |
| I-Person | 人名中间 |
| B-Organization | 组织名开始 |
| I-Organization | 组织名中间 |
| O | 非命名实体 |
1.3、评估指标
- Precision
- Recall
- F1
2、代码实战
2.1、导包
from transformers import (AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer,
DataCollatorForTokenClassification)
from datasets import load_dataset
import evaluate
import requests
2.2、加载数据集
# Load the People's Daily NER dataset (downloaded on first run, then cached).
ner_datasets = load_dataset(
    'peoples_daily_ner',
    cache_dir='./data',
    trust_remote_code=True,
)
查看数据集
# Inspect the first training example (notebook cell: value is displayed, not used).
ner_datasets['train'][0]
# Inspect the feature schema of the test split.
ner_datasets['test'].features
# String names for the integer `ner_tags` ids; used later for id<->tag conversion.
label_list = ner_datasets['train'].features['ner_tags'].feature.names
label_list
2.3、数据预处理
# Load the tokenizer matching the local MacBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("../Model/Chinese-macbert-base")
# attention_mask: keeps attention from attending to padding tokens
# token_type_ids: marks which ids belong to the first vs. the second sentence
print(tokenizer(ner_datasets['train'][0]['tokens'],is_split_into_words=True))# for data already split into words, is_split_into_words=True must be specified
res = tokenizer('integer world')  # NOTE(review): result is immediately overwritten below
res = tokenizer('你好')
res
def process_function(examples):
    """Tokenize pre-split examples and align NER tags to sub-word tokens.

    Special tokens (word_id is None) are labeled -100 so the loss ignores
    them; every sub-word of a word inherits that word's original tag.
    """
    tokenized_examples = tokenizer(
        examples["tokens"],
        max_length=128,
        is_split_into_words=True,
        truncation=True,
    )
    labels = []
    for idx, tags in enumerate(examples["ner_tags"]):
        # word_ids maps each token position back to its source word index.
        word_ids = tokenized_examples.word_ids(batch_index=idx)
        labels.append(
            [-100 if wid is None else tags[wid] for wid in word_ids]
        )
    tokenized_examples["labels"] = labels
    return tokenized_examples
# Apply the preprocessing to every split in one batched pass.
tokenized_datasets = ner_datasets.map(process_function, batched=True)
# Sanity-check one processed example.
print(tokenized_datasets['train'][0])
2.4、创建模型
# Token-classification head on top of the encoder; num_labels must match the tag set.
# FIX: path capitalization made consistent with the tokenizer load
# ("../Model/Chinese-macbert-base") — the lowercase variant fails on
# case-sensitive filesystems.
model = AutoModelForTokenClassification.from_pretrained(
    '../Model/Chinese-macbert-base',
    num_labels=len(label_list),
)
model.config.num_labels  # confirm the label count was picked up
2.5、创建评估函数
# Load the seqeval metric from a local script; it computes entity-level
# precision/recall/F1 for IOB-style tag sequences (used in eval_metric below).
seqeval = evaluate.load('seqeval_metric.py')
seqeval
import numpy as np
def eval_metric(pred):
    """Compute entity-level F1 from a (predictions, labels) pair.

    `predictions` are per-token class scores; `labels` use -100 to mark
    positions (special/padding tokens) that must be excluded.
    """
    logits, gold = pred
    pred_ids = np.argmax(logits, axis=-1)  # scores -> predicted class ids
    # Map ids to tag strings, skipping every position whose gold label is -100.
    true_predictions = [
        [label_list[p] for p, l in zip(seq_pred, seq_gold) if l != -100]
        for seq_pred, seq_gold in zip(pred_ids, gold)
    ]
    true_labels = [
        [label_list[l] for p, l in zip(seq_pred, seq_gold) if l != -100]
        for seq_pred, seq_gold in zip(pred_ids, gold)
    ]
    # Strict IOB2 matching: an entity counts only if boundary and type both match.
    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOB2")
    return {
        "f1": result["overall_f1"]
    }
2.6、创建配置函数
# Training configuration: small per-device batches with heavy gradient
# accumulation, checkpointing and Adafactor to keep GPU memory low.
args = TrainingArguments(
    output_dir='model_for_ner',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=32,  # less GPU memory, longer training time
    gradient_checkpointing=True,     # recompute activations to save memory
    optim='adafactor',               # memory-efficient optimizer
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,     # reload the checkpoint with the best F1
    metric_for_best_model='f1',
    logging_steps=50,
    num_train_epochs=1,
)
2.7、创建Trainer
# Freeze the encoder: only the classification head stays trainable.
for _name, param in model.bert.named_parameters():
    param.requires_grad = False

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=eval_metric,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
)
2.8、训练
# Fine-tune (only the classification head is trainable; the encoder is frozen).
trainer.train()
2.9、模型预测
from transformers import pipeline

# Expose human-readable tag names on the config so the pipeline reports them.
model.config.id2label = dict(enumerate(label_list))
# 'simple' aggregation merges sub-word pieces into whole entity spans.
ner_pipe = pipeline(
    'token-classification',
    model=model,
    tokenizer=tokenizer,
    device=0,  # run on the first GPU
    aggregation_strategy='simple',
)
res = ner_pipe('小明在北京上班')
res
# Group recognized spans by entity type: {entity_group: [surface strings]}.
ner_results = {}
x = "小明在北京上班"
for span in res:
    group = span['entity_group']
    # Slice the original text with the span offsets reported by the pipeline.
    ner_results.setdefault(group, []).append(x[span['start']:span['end']])
ner_results