一,统计每个文本的长度,找出最大长度和最小长度,并统计各长度出现的频次。
1.文本
2.数据统计代码
def get_text_length(dir):
    """Collect line-length statistics over every ``.txt`` file in *dir*.

    Each line of each text file is measured with ``len(line)``; the line
    terminator, when present, is part of the measured length. Lines shorter
    than 50 characters are printed so they can be inspected by hand.

    Args:
        dir: Path of the directory that holds the ``.txt`` files.

    Returns:
        A tuple ``(text_max, text_min, hist)`` where ``text_max`` and
        ``text_min`` are the largest and smallest line lengths seen
        (``0`` and ``999999`` when no line was read) and ``hist`` is a list in
        which ``hist[n]`` is the number of lines of length ``n``. The list has
        at least 3037 slots and grows when a longer line is encountered.
    """
    # 3037 slots are kept as the minimum size so existing consumers that plot
    # the whole list see the same x-range as before.
    hist = [0] * 3037
    text_max = 0
    text_min = 999999
    # Sorted so the debug output is emitted in a reproducible order.
    for file_name in sorted(os.listdir(dir)):
        # Only real text files are opened; an annotation file without a
        # matching text file no longer triggers FileNotFoundError.
        if not file_name.endswith('.txt'):
            continue
        with open(os.path.join(dir, file_name), 'r', encoding='utf8') as f:
            for line in f:
                lens = len(line)
                # Grow the histogram instead of failing with IndexError on
                # lines longer than the preallocated range.
                if lens >= len(hist):
                    hist.extend([0] * (lens - len(hist) + 1))
                hist[lens] += 1
                if lens < 50:
                    print(line)
                text_max = max(text_max, lens)
                text_min = min(text_min, lens)
    return text_max, text_min, hist
# Gather line-length statistics for the files under test_dir; `l` is the
# per-length histogram that the plotting code below consumes.
# NOTE(review): test_dir is not defined in this section — confirm it is set earlier.
tmax,tmin,l=get_text_length(test_dir)
# Report the longest and shortest line lengths that were found.
print(tmax,tmin)
3.绘制直方图
# Bar chart of the line-length histogram `l`: one bar per index of `l`,
# bar height = the value stored at that index.
fig = plt.figure(figsize=(18, 8))
plt.title('Length statistics of text', fontsize=13)
plt.xlabel('length', fontsize=13)
plt.ylabel('quantity', fontsize=13)
plt.bar(
    range(len(l)),
    l,
    width=0.8,
    edgecolor='#0077CC',
    linewidth=1,
)
# The figure handle is held before show() so the same figure can be written
# to disk afterwards.
plt.show()
fig.savefig('./length_statistics_of_text.png')
二,统计实体名的长度及其出现频次
1.文本
2.数据统计代码
def get_entities(dir):
    """Count how often each entity-name length occurs in the ``.ann`` files of *dir*.

    Every annotation line is split on tab characters and its third field is
    taken as the entity name. The line terminator is removed before
    splitting, so the reported length covers the name only. Lines with fewer
    than three tab-separated fields (blank lines, for instance) are skipped.
    Lines whose name length is 1 or 13 are printed for manual inspection.

    Args:
        dir: Path of the directory that holds the ``.ann`` files.

    Returns:
        A tuple ``(text_max, text_min, entities)`` where ``text_max`` and
        ``text_min`` are the largest and smallest name lengths seen
        (``0`` and ``999999`` when no entity was read) and ``entities`` maps
        each name length to the number of names of that length.
    """
    entities = {}
    text_max = 0
    text_min = 999999
    # Sorted so the debug output is emitted in a reproducible order.
    for file_name in sorted(os.listdir(dir)):
        # Only real annotation files are opened; a text file without a
        # matching annotation file no longer triggers FileNotFoundError.
        if not file_name.endswith('.ann'):
            continue
        with open(os.path.join(dir, file_name), 'r', encoding='utf8') as f:
            for line in f:
                # Drop the newline first: otherwise it is counted as part of
                # the last field and inflates the name length by one.
                fields = line.rstrip('\n').split('\t')
                if len(fields) < 3:
                    # No name field on this line; indexing it would raise.
                    continue
                lens = len(fields[2])
                # NOTE(review): 1 and 13 look like outlier lengths singled
                # out for inspection — confirm they are still the right
                # values now that the newline is excluded.
                if lens == 1 or lens == 13:
                    print(line)
                entities[lens] = entities.get(lens, 0) + 1
                text_max = max(text_max, lens)
                text_min = min(text_min, lens)
    return text_max, text_min, entities
# print(name)
# if name in entities:
# entities[name]+=1
# else:
# entities[name]=1
# return entities
# Gather entity-name length statistics from the annotation files under train_dir.
# NOTE(review): train_dir is not defined in this section — confirm it is set earlier.
text_max,text_min,entities=get_entities(train_dir)
# Bare expression: echoes the tuple when it is the last statement of a
# notebook/REPL cell; it has no effect when run as a plain script.
text_max,text_min,entities
3.直方图
from matplotlib import pyplot as plt
# Dump the raw {length: count} mapping for inspection.
print(entities)
# Order the (length, count) pairs by ascending length.
entities_order = sorted(entities.items(), key=lambda pair: pair[0])
print(entities_order)
# Rebuild a dict whose insertion order follows the sorted lengths.
dic = dict(entities_order)
print(dic)
print(list(dic.keys()))
print(list(dic.values()))
num_list = list(dic.values())
# NOTE(review): adds 1 to the second-to-last bucket and then discards the
# last bucket; presumably a dataset-specific correction — confirm.
num_list[-2] += 1
del num_list[-1]
def autolabel(rects):
    """Write each bar's height just above the bar, centred horizontally.

    Args:
        rects: Iterable of bar objects (such as the container returned by
            ``plt.bar``) exposing ``get_x()``, ``get_width()`` and
            ``get_height()``.
    """
    for rect in rects:
        height = rect.get_height()
        # Anchor the label at the bar's horizontal centre, bottom-aligned on
        # the bar's top edge.
        plt.text(rect.get_x() + rect.get_width() / 2, height, height, ha='center', va='bottom')


# Bar chart of the entity-name length counts held in num_list.
plt.figure(figsize=(18, 8))
plt.title('Length statistics of entity', fontsize=13)
plt.xlabel(u'length', fontsize=13)
plt.ylabel(u'quantity', fontsize=13)
autolabel(plt.bar(range(len(num_list)), num_list, width=0.8, edgecolor='darkblue', lw=1))
# Tick labels are set only after the figure and its bars exist: calling
# xticks before plt.figure() would act on a separate, implicitly created
# figure instead of the one saved below.
# NOTE(review): name_list is not defined in this section — confirm it is
# built elsewhere with one label per entry of num_list.
plt.xticks(range(len(num_list)), name_list, rotation=0)
# Hold the figure handle before show() so it can still be saved afterwards.
fig = plt.gcf()
plt.show()
fig.savefig('./length_statistics_of_entity.png')
4.结果