【机器学习+NER】手把手教你用机器学习CRF模型构建NER系统(CCL2021)
数据集来源:2021年中文计算语言学研究大会的智能对话诊疗评测比赛
任务:利用机器学习CRF模型构建NER系统,得到下图所示的评估指标
原始数据处理参考:医疗命名体识别之数据预处理(处理.json文件)
一、环境搭建
此处的CRF模型主要借助 sklearn_crfsuite 库来搭建,可以通过以下命令安装该库:
pip install sklearn_crfsuite
二、数据类型处理
- 该数据集含有11类标签,分别为
“O”、“B-Symptom”、“I-Symptom”、“B-Drug_Category”、“I-Drug_Category”、“B-Drug”、“I-Drug”、“B-Medical_Examination”、“I-Medical_Examination”、“B-Operation”、“I-Operation”
给数据标签一个索引,即:
# CCL2021数据标签:
label2idx = {'O': 0, 'B-Symptom': 1, 'I-Symptom': 2, 'B-Drug_Category': 3, 'I-Drug_Category': 4, 'B-Drug': 5, 'I-Drug': 6, 'B-Medical_Examination': 7, 'I-Medical_Examination': 8,'B-Operation': 9, 'I-Operation': 10 }
- 将索引和标签一一对应,存储到 idx2label 中:
idx2label = {idx: label for label, idx in label2idx.items()}
- 读取字符字典文件
# Read the character vocabulary file; every stripped line becomes one entry,
# placed after the special tokens.
with open(char_vocab_path, "r", encoding="utf8") as vocab_file:
    char_vocabs = special_words + [entry.strip() for entry in vocab_file]
- 将字符与索引编号对应,便于后续查找字符
# Two-way mapping between characters and their positions in `char_vocabs`.
idx2vocab = dict(enumerate(char_vocabs))
vocab2idx = {char: idx for idx, char in enumerate(char_vocabs)}
- 读取训练语料,将原始数据7成划分为训练集,3成划分为测试集,返回数据和标签
# Read the annotated corpus and return its training or test portion.
def read_corpus(corpus_path, vocab2idx, label2idx, flag):
    """Load one portion of a character-level BIO corpus as index sequences.

    The file holds one ``<char> <label>`` pair per line, with a blank line
    between sentences. The first ~70% of the lines form the training portion
    and the remainder the test portion. The cut point is moved forward to the
    next blank line so that no sentence is split between the two portions.

    Args:
        corpus_path: Path of the UTF-8 encoded corpus file.
        vocab2idx: Mapping from character to index; characters that are not
            in it are mapped to ``vocab2idx['<UNK>']``.
        label2idx: Mapping from tag to index; tags that are not in it are
            mapped to 0.
        flag: ``"train"`` selects the training portion, any other value
            selects the test portion.

    Returns:
        A ``(datas, labels)`` tuple of parallel lists: one list of character
        indices and one list of tag indices per sentence.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()

    # Snap the 70% cut to a sentence boundary so the sentence straddling it
    # stays whole (it goes to the training portion).
    cut = int(len(lines) * 0.7)
    while cut < len(lines) and lines[cut].strip():
        cut += 1
    lines = lines[:cut] if flag == "train" else lines[cut:]

    datas, labels = [], []
    sent_, tag_ = [], []

    def _flush():
        # Emit the sentence collected so far; skip empty ones so that
        # repeated blank lines do not produce empty sequences.
        if not sent_:
            return
        datas.append([vocab2idx[ch] if ch in vocab2idx else vocab2idx['<UNK>'] for ch in sent_])
        labels.append([label2idx[tag] if tag in label2idx else 0 for tag in tag_])
        sent_.clear()
        tag_.clear()

    for line in lines:
        fields = line.split()
        if not fields:  # blank or whitespace-only line ends a sentence
            _flush()
            continue
        if len(fields) != 2:
            raise ValueError(f"expected '<char> <label>', got {line!r}")
        char, label = fields
        sent_.append(char)
        tag_.append(label)
    # The portion may not end with a blank line; keep its last sentence.
    _flush()
    return datas, labels


# Load the training portion (first ~70% of the corpus).
train_datas, train_labels = read_corpus(train_data_path, vocab2idx, label2idx, flag="train")
# Load the test portion (the rest of the same file).
test_datas, test_labels = read_corpus(train_data_path, vocab2idx, label2idx, flag="test")
- 简单测试一下,看数据是否对应上了
# Sanity check: show sentence 8 as character ids, characters, tag ids and tags.
print(train_datas[8])
print(list(map(idx2vocab.__getitem__, train_datas[8])))
print(train_labels[8])
print(list(map(idx2label.__getitem__, train_labels[8])))
# Output:
# [1578, 5558, 2641, 5795, 2644, 3078, 2644, 939, 893, 3844, 3575, 946, 6821]
# ['宝', '贝', '最', '近', '有', '没', '有', '呕', '吐', '症', '状', '呢', '?']
# [0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0]
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Symptom', 'I-Symptom', 'O', 'O', 'O', 'O']
# Row 1: character indices
# Row 2: characters
# Row 3: tag indices
# Row 4: tags
- 转换数据格式,分别将数据与标签转换成CRF模型支持的数据格式
# Map the index sequences back to strings: characters for the inputs and
# BIO tags for the targets, which is what is passed to the CRF below.
train_datas = [[idx2vocab[idx] for idx in sentence] for sentence in train_datas]
train_labels = [[idx2label[idx] for idx in tags] for tags in train_labels]

# Same conversion for the test portion.
test_datas = [[idx2vocab[idx] for idx in sentence] for sentence in test_datas]
test_labels = [[idx2label[idx] for idx in tags] for tags in test_labels]
三、训练模型
- 利用 sklearn_crfsuite 库调用CRF模型,默认采用 lbfgs 算法:
crf = sklearn_crfsuite.CRF( algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True )
- 利用训练数据和训练数据标签训练模型
# Fit the CRF on the training sequences and their tag sequences.
crf.fit(train_datas, train_labels)
- 利用测试数据,预测测试数据标签
# Predict one tag sequence for every test sequence.
test_pred = crf.predict(test_datas)
- 统计整个数据的标签信息,并进行排序
# Tags known to the fitted model.
labels = [*crf.classes_]
# labels.remove('O')
# Order by entity type first and B/I prefix second, so the B-/I- tags of one
# entity type sit next to each other ('O' sorts first).
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))
- 为了满足 classification_report 的数据格式要求,我们对测试集数据标签与预测结果标签进行数据格式转换,转换成一维列表:
# 转换成一维列表 label = [] pred = [] for i in range(len(test_labels)): for j in range(len(test_labels[i])): label.append(test_labels[i][j]) test_labels = label for i in range(len(test_pred)): for j in range(len(test_pred[i])): pred.append(test_pred[i][j]) test_pred = pred
- 输出分类结果报告,得到结果
# Pass the tag list through `labels=` so every row is computed for the tag it
# names. `target_names=` only relabels rows that scikit-learn computes in its
# own sorted label order, which differs from the order of `sorted_labels`.
print(classification_report(
    test_labels, test_pred, labels=sorted_labels
))
四、结果分析
从评估结果可以看出,模型精确率(precision)的宏平均值 macro avg=0.90,加权平均值 weighted avg=0.97。也就是说,在模型预测为某一类别的字符中,各类别平均约有90%是预测正确的;
模型召回率(recall)的宏平均值 macro avg=0.89,加权平均值 weighted avg=0.97,即各类别的真实标签中平均约有89%被模型正确找回;加权平均值按各类别的样本数加权后达到0.97,说明样本数量较多的类别召回率更高。
模型 F1-score 的宏平均值 macro avg=0.89,加权平均值 weighted avg=0.97。F1-score 是精确率和召回率的调和平均值,说明模型在精确率和召回率两方面都取得了较好的效果。
五、完整源码
import sklearn_crfsuite
from sklearn.metrics import classification_report
# Input locations and the special tokens placed at the front of the vocabulary.
char_vocab_path = "./data/char_vocabs.txt"  # character vocabulary file
train_data_path = "./data/train_data.txt"  # corpus that is split into train/test data
special_words = ['<PAD>', '<UNK>']  # special tokens

# CCL2021 tag set: 'O' plus B-/I- tags for five entity types, numbered 0..10.
label2idx = {
    label: idx
    for idx, label in enumerate([
        'O',
        'B-Symptom', 'I-Symptom',
        'B-Drug_Category', 'I-Drug_Category',
        'B-Drug', 'I-Drug',
        'B-Medical_Examination', 'I-Medical_Examination',
        'B-Operation', 'I-Operation',
    ])
}
# Reverse lookup: index -> BIO tag.
idx2label = dict(zip(label2idx.values(), label2idx.keys()))

# Read the character vocabulary file; every stripped line becomes one entry,
# placed after the special tokens.
with open(char_vocab_path, "r", encoding="utf8") as vocab_file:
    char_vocabs = special_words + [entry.strip() for entry in vocab_file]

# Two-way mapping between characters and their positions in `char_vocabs`.
idx2vocab = dict(enumerate(char_vocabs))
vocab2idx = {char: idx for idx, char in enumerate(char_vocabs)}
# Read the annotated corpus and return its training or test portion.
def read_corpus(corpus_path, vocab2idx, label2idx, flag):
    """Load one portion of a character-level BIO corpus as index sequences.

    The file holds one ``<char> <label>`` pair per line, with a blank line
    between sentences. The first ~70% of the lines form the training portion
    and the remainder the test portion. The cut point is moved forward to the
    next blank line so that no sentence is split between the two portions.

    Args:
        corpus_path: Path of the UTF-8 encoded corpus file.
        vocab2idx: Mapping from character to index; characters that are not
            in it are mapped to ``vocab2idx['<UNK>']``.
        label2idx: Mapping from tag to index; tags that are not in it are
            mapped to 0.
        flag: ``"train"`` selects the training portion, any other value
            selects the test portion.

    Returns:
        A ``(datas, labels)`` tuple of parallel lists: one list of character
        indices and one list of tag indices per sentence.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()

    # Snap the 70% cut to a sentence boundary so the sentence straddling it
    # stays whole (it goes to the training portion).
    cut = int(len(lines) * 0.7)
    while cut < len(lines) and lines[cut].strip():
        cut += 1
    lines = lines[:cut] if flag == "train" else lines[cut:]

    datas, labels = [], []
    sent_, tag_ = [], []

    def _flush():
        # Emit the sentence collected so far; skip empty ones so that
        # repeated blank lines do not produce empty sequences.
        if not sent_:
            return
        datas.append([vocab2idx[ch] if ch in vocab2idx else vocab2idx['<UNK>'] for ch in sent_])
        labels.append([label2idx[tag] if tag in label2idx else 0 for tag in tag_])
        sent_.clear()
        tag_.clear()

    for line in lines:
        fields = line.split()
        if not fields:  # blank or whitespace-only line ends a sentence
            _flush()
            continue
        if len(fields) != 2:
            raise ValueError(f"expected '<char> <label>', got {line!r}")
        char, label = fields
        sent_.append(char)
        tag_.append(label)
    # The portion may not end with a blank line; keep its last sentence.
    _flush()
    return datas, labels
# Load the training portion (first ~70% of the corpus).
train_datas, train_labels = read_corpus(train_data_path, vocab2idx, label2idx, flag="train")
# Load the test portion (the rest of the same file).
test_datas, test_labels = read_corpus(train_data_path, vocab2idx, label2idx, flag="test")

# Sanity check: uncomment to inspect sentence 8 (it contains "呕吐").
# print(train_datas[8])
# print([idx2vocab[idx] for idx in train_datas[8]])
# print(train_labels[8])
# print([idx2label[idx] for idx in train_labels[8]])

# Map the index sequences back to strings: characters for the inputs and
# BIO tags for the targets, which is what is passed to the CRF below.
train_datas = [[idx2vocab[idx] for idx in sentence] for sentence in train_datas]
train_labels = [[idx2label[idx] for idx in tags] for tags in train_labels]

# Same conversion for the test portion.
test_datas = [[idx2vocab[idx] for idx in sentence] for sentence in test_datas]
test_labels = [[idx2label[idx] for idx in tags] for tags in test_labels]
# Train a linear-chain CRF with L-BFGS; c1 and c2 are the L1 and L2
# regularization coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_datas, train_labels)

# Tags known to the fitted model. To report entity tags only, remove 'O'
# from this list, e.g. labels.remove('O').
labels = list(crf.classes_)

# Predict one tag sequence for every test sequence.
test_pred = crf.predict(test_datas)

# Order by entity type first and B/I prefix second, so the B-/I- tags of one
# entity type sit next to each other ('O' sorts first).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

# classification_report expects flat label lists, so flatten the
# per-sentence tag sequences.
test_labels = [tag for sentence in test_labels for tag in sentence]
test_pred = [tag for sentence in test_pred for tag in sentence]

# Pass the tag list through `labels=` so every row is computed for the tag it
# names. `target_names=` only relabels rows that scikit-learn computes in its
# own sorted label order, which differs from the order of `sorted_labels`.
print(classification_report(
    test_labels, test_pred, labels=sorted_labels
))