def get_tags_list(input_file):
    """Collect the distinct tags that occur in a NER dataset file.

    Every non-blank line is expected to hold a token, some whitespace, and
    then the tag of that token. Blank lines are ignored. A line that holds
    a token but no tag is skipped: it neither adds a tag nor counts as a
    word. Leading whitespace on a line is ignored.

    Args:
        input_file: Path of the UTF-8 encoded dataset file.

    Returns:
        A ``(tags_list, word_sum)`` tuple. ``tags_list`` holds each distinct
        tag once, in order of first appearance. ``word_sum`` is the number
        of lines from which a tag was read.
    """
    tags_list = []
    seen = set()  # O(1) membership test; tags_list keeps first-seen order
    word_sum = 0
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # split(None, 1) yields [token, rest]; "rest" is missing when
            # the line is blank or carries a token without any tag.
            fields = line.split(None, 1)
            if len(fields) < 2:
                continue
            word_sum += 1
            tag = fields[1].strip()
            if tag not in seen:
                seen.add(tag)
                tags_list.append(tag)
    return tags_list, word_sum
def get_tags_num(input_file, tag_list):
    """Count how often each tag occurs in a NER dataset file.

    Every non-blank line is expected to hold a token, some whitespace, and
    then the tag of that token. Blank lines are ignored. A non-blank line
    without a tag is still included in the line total but adds to no tag
    count, so a total that differs from the sum of the tag counts points
    at such lines.

    Args:
        input_file: Path of the UTF-8 encoded dataset file.
        tag_list: Iterable of the tags to count; each starts at zero.

    Returns:
        A ``(tag_id, line_total)`` tuple. ``tag_id`` maps every tag of
        ``tag_list`` to its number of occurrences. ``line_total`` is the
        number of non-blank lines in the file.

    Raises:
        KeyError: If the file contains a tag that is not in ``tag_list``.
    """
    tag_id = {tag: 0 for tag in tag_list}
    line_total = 0
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # split(None, 1) yields [] for a blank line, [token] for a
            # token without a tag, and [token, rest] otherwise.
            fields = line.split(None, 1)
            if not fields:
                continue
            line_total += 1
            if len(fields) < 2:
                continue
            # Deliberately a plain lookup: an unknown tag raises KeyError.
            tag_id[fields[1].strip()] += 1
    return tag_id, line_total
if __name__ == '__main__':
    # Dataset file whose tag distribution is reported.
    input_file = './data/MsraNER/BIOES/test.txt'
    # Fixed tag inventory to count. get_tags_list(input_file) can be used
    # instead to derive the inventory from the data itself.
    tag_list = ['O', 'S-LOC', 'B-LOC', 'E-LOC', 'B-PER', 'I-PER', 'E-PER', 'B-ORG', 'I-ORG', 'E-ORG', 'I-LOC', 'S-PER', 'S-ORG']
    result, num = get_tags_num(input_file, tag_list)
    # Print the line total, the sum of all per-tag counts, and the
    # per-tag counts themselves.
    print(num)
    print(sum(result.values()))
    print(result)
# Count the number of occurrences of each individual tag
# Latest recommended article published on 2023-12-06 14:15:09