# from .help import flatten_lists
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import recall_score
def flatten_lists(lists):
    """Flatten one level of nesting: a list of lists becomes a flat list.

    Items that are lists (including list subclasses) are spliced into the
    result; every other item is appended unchanged. Only the outermost level
    is flattened, so lists nested deeper are kept as they are.

    :param lists: iterable whose items are either lists or single values
    :return: a new flat list
    """
    flatten_list = []
    for item in lists:
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses.
        if isinstance(item, list):
            flatten_list.extend(item)
        else:
            flatten_list.append(item)
    return flatten_list
# Entity-level precision is used here instead of label-level accuracy, because it is the more meaningful measure for an NER task.
def _find_tag(labels, B_label="B-defect",I_label="M-defect", E_label="E-defect", S_label="S-defect"):
result = []
lenth = 0
for num in range(len(labels)):
if labels[num] == B_label:
song_pos0 = num
if labels[num] == B_label and labels[num+1] == E_label:
lenth = 2
result.append((song_pos0,lenth))
if labels[num] == I_label and labels[num-1] == B_label:
lenth = 2
for num2 in range(num,len(labels)):
if labels[num2] == I_label and labels[num2-1] == I_label:
lenth += 1
if labels[num2] == E_label:
lenth += 1
result.append((song_pos0,lenth))
break
if labels[num] == S_label:
lenth = 1
song_pos0 = num
result.append((song_pos0,lenth))
return result
# Labels of the bridge dataset after converting them from BIO to BMES.
# Each entry holds the (begin, middle, end, single) labels of one entity
# type; every type is listed exactly once so it is scanned only once.
tags = [
    ("B-defect", "M-defect", "E-defect", "S-defect"),
    ("B-comp", "M-comp", "E-comp", "S-comp"),
    ("B-action", "M-action", "E-action", "S-action"),
    ("B-Inspec", "M-Inspec", "E-Inspec", "S-Inspec"),
]
def find_all_tag(labels):
    """Collect the entity spans of every type listed in ``tags``.

    :param labels: flat list of label strings
    :return: dict keyed by the second "-"-separated field of each begin
        label (e.g. "defect" for "B-defect"); each value is the list that
        ``_find_tag`` returns for that type
    """
    spans_by_type = {}
    for label_set in tags:
        begin, middle, end, single = label_set[:4]
        spans = _find_tag(labels, B_label=begin, I_label=middle, E_label=end, S_label=single)
        type_name = begin.split("-")[1]
        spans_by_type[type_name] = spans
    return spans_by_type
def precision(pre_labels, true_labels):
    """Compute entity-level precision, overall and per entity type.

    Of the m entities found in the predicted labels, n carry exactly the
    same labels as the gold sequence over the same span; precision is n / m.

    :param pre_labels: predicted labels, a list of label lists (or a flat list)
    :param true_labels: gold labels in the same layout as ``pre_labels``
    :return: ``(total_precision, result_dic)`` where ``result_dic`` maps each
        entity type that has at least one predicted entity to its precision.
        ``total_precision`` is 0.0 when no entity was predicted at all.
    """
    print("评价实体级查准率(精确率)的输入数据为:")
    print(pre_labels)
    print(true_labels)
    pre_labels = flatten_lists(pre_labels)  # 2-D label lists -> one flat list
    true_labels = flatten_lists(true_labels)
    # Start index and length of every predicted entity, grouped by type.
    pre_result = find_all_tag(pre_labels)

    hits = []
    result_dic = {}
    for name, spans in pre_result.items():
        if not spans:
            # Types without any predicted entity are left out of result_dic.
            continue
        # A predicted entity is correct when the gold labels over the same
        # span are identical.
        type_hits = [
            int(pre_labels[start:start + length] == true_labels[start:start + length])
            for start, length in spans
        ]
        hits.extend(type_hits)
        result_dic[name] = sum(type_hits) / len(type_hits)

    # Guard against division by zero when nothing was predicted.
    total_precision = sum(hits) / len(hits) if hits else 0.0
    print("total_precision:", total_precision)
    print("result_dic:", result_dic)
    return total_precision, result_dic
# Computing precision, recall and F1 with seqeval.
# https://zhuanlan.zhihu.com/p/495414141
# Example data:
# y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
# y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
def getMetrics(pre_labels, true_labels):
    """Score predictions with seqeval and print the results.

    Prints seqeval's classification report, the three scores and the
    accuracy score.

    :param pre_labels: predicted label sequences
    :param true_labels: gold label sequences
    :return: ``(precision, recall, f1)`` as returned by seqeval's
        ``precision_score``, ``recall_score`` and ``f1_score``
    """
    # seqeval takes the gold labels first and the predictions second.
    p_score = precision_score(true_labels, pre_labels)
    r_score = recall_score(true_labels, pre_labels)
    f_score = f1_score(true_labels, pre_labels)
    label_accuracy = accuracy_score(true_labels, pre_labels)
    report = classification_report(true_labels, pre_labels)
    print(report)
    print("precision,recall,f1:", p_score, r_score, f_score)
    print("accuracy_sco:", label_accuracy)
    return p_score, r_score, f_score
if __name__ == "__main__":
    print()
    y_true = [['B-comp', 'E-comp', 'O','O', 'S-action', 'S-action', 'B-defect', 'M-defect', 'E-defect', 'S-action'], ['B-comp', 'E-comp', 'O', 'S-action','B-defect', 'E-defect', 'O']]
    y_pred = [['O', 'O', 'O','O', 'S-action', 'O', 'B-defect', 'M-defect', 'E-defect', 'S-action'], ['B-comp', 'E-comp', 'S-action', 'O', 'O', 'B-defect', 'O']]
    # Walk-through of the demo data.
    #
    # The gold annotation holds 2 comp, 2 defect and 4 action entities, 8 in
    # total. These counts are the "support" column of the report: the number
    # of samples of that class in the test data. Support matters because
    # precision and recall become unstable for a class with very few samples;
    # a larger support generally means a more reliable score.
    #
    # action: 3 predicted, 2 correct -> precision 2/3 = 0.6667,
    #         recall 2/4 = 0.5, f1 = 0.5714.
    # comp:   1 predicted and it is correct -> precision 1.0; 2 exist and 1
    #         was found -> recall 0.5, f1 = 0.67.
    # defect: 2 predicted (seqeval also counts an incomplete run of B/M/E
    #         labels, or an isolated one, as a predicted entity), 1 correct
    #         -> precision 0.5, recall 0.5, f1 = 0.5.
    #
    # The accuracy reported by seqeval.metrics is label-level, not
    # entity-level: matching labels (10) / total labels (17) = 0.5882.
    # Overall, 6 entities were predicted and 4 are correct, so precision is
    # 4/6 = 0.667; 4 of the 8 gold entities were found, so recall is 0.5 and
    # the overall f1 is 0.5714.
    getMetrics(y_pred,y_true)
    # Recorded output of the call above:
    #
    #               precision    recall  f1-score   support
    #       action       0.67      0.50      0.57         4
    #         comp       1.00      0.50      0.67         2
    #       defect       0.50      0.50      0.50         2
    #    micro avg       0.67      0.50      0.57         8
    #    macro avg       0.72      0.50      0.58         8
    # weighted avg       0.71      0.50      0.58         8
    # precision,recall,f1: 0.6666666666666666 0.5 0.5714285714285715
    # accuracy_sco: 0.5882352941176471
    print("===============")
    precision(y_pred,y_true)
    # Recorded output of the call above (after the echoed input data):
    #
    # total_precision: 0.8   (the lone 'B-defect' is not counted as a defect entity)
    # result_dic: {'defect': 1.0, 'comp': 1.0, 'action': 0.6666666666666666}
# seqeval.metrics: worked example of computing accuracy, precision, recall and F1 for entity recognition
# First published 2023-04-25 21:09:53