命名实体识别数据预处理
得到实体类别数量
import os
train_dir = 'datas/ruijin_round1_train2_20181022'  # path of the data files
def get_entities(dir):
    """Count how often each annotation type occurs in a directory's ``.ann`` files.

    Every line of every ``.ann`` file is split on tabs; the type name is the
    part of the second field that precedes the first space (e.g. a line of
    the form ``T1<TAB>Disease 10 15<TAB>text`` counts towards ``Disease``).

    Args:
        dir: Directory to scan (not recursive). The name shadows the
            builtin ``dir`` but is kept so existing keyword callers work.

    Returns:
        A dict mapping each type name to the number of lines carrying it.
        Empty when the directory holds no usable ``.ann`` files.
    """
    entities = {}
    # Only look at real .ann files: other files (e.g. a .txt without a
    # matching .ann, hidden files) must not cause a FileNotFoundError.
    # Sorting makes the insertion order of the result reproducible, since
    # os.listdir returns entries in arbitrary order.
    ann_names = sorted(name for name in os.listdir(dir) if name.endswith('.ann'))
    for name in ann_names:
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.rstrip('\n').split('\t')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to count.
                    continue
                label = fields[1].split(' ')[0]
                if not label:
                    continue
                entities[label] = entities.get(label, 0) + 1
    return entities
if __name__ == '__main__':
    # Prints the number of distinct entity types; drop len() to see how many
    # times each type occurs.
    print(len(get_entities(train_dir)))
定义标签:为每个实体类别生成 B-、I- 标记,并加上 O 标记
def get_labelencoder(entities):
    """Build BIO tag vocabularies from a mapping of entity type to count.

    Types are ordered by descending count (ties keep the mapping's own
    order). Index 0 is always ``'O'``; each type then contributes
    ``'B-<type>'`` followed by ``'I-<type>'``.

    Args:
        entities: Mapping of entity type name to its count (anything with
            an ``items()`` method yielding ``(name, count)`` pairs).

    Returns:
        A tuple ``(id2label, label2id)``: the list of tags indexed by id,
        and the inverse dict mapping each tag to its id.
    """
    ranked = sorted(entities.items(), key=lambda pair: pair[1], reverse=True)
    id2label = ['O']
    for name, _count in ranked:
        id2label.append('B-' + name)
        id2label.append('I-' + name)
    label2id = {label: idx for idx, label in enumerate(id2label)}
    return id2label, label2id
通过上面这两个函数,就可以统计出数据中有多少种实体类别,并为每个类别定义 B、I、O 标注。