# -*- coding: utf-8 -*-
"""
Created on Fri Jan 6 11:17:32 2017
@author: yang
"""
from collections import Counter

# Punctuation and list markers that are counted but never written to the
# output file. The ASCII entries are inert in practice, because only code
# points above 127 are ever counted; they are kept from the original list.
EXCLUDE = frozenset([
    '”', '“', ',', ':', ';', ')', '(', '《', '》', '~', '【', '】', '、', '。',
    '①', '③', '②', '④', '⑤', '⑥', '⑦', '⑧', '⑨',
])


def count_characters(lines):
    """Count every non-ASCII character found in an iterable of text lines.

    Characters are counted as Unicode code points, so a line is never
    discarded because it contains a character whose UTF-8 encoding is not
    exactly three bytes long.

    Args:
        lines: iterable of ``str`` (for example an open text file).

    Returns:
        A ``collections.Counter`` mapping each character with
        ``ord(ch) > 127`` to its number of occurrences. Characters listed in
        ``EXCLUDE`` are still counted here; they are filtered on output.
    """
    counts = Counter()
    for line in lines:
        # ASCII (including the trailing newline) is dropped by the filter,
        # so no separate strip() is required.
        counts.update(ch for ch in line if ord(ch) > 127)
    return counts


def format_counts(counts, exclude=EXCLUDE):
    """Render counts as ``"<char> <count>\\n"`` lines, skipping excluded keys.

    Args:
        counts: mapping of character to occurrence count.
        exclude: container of characters that must not be emitted.

    Returns:
        A list of output lines in the iteration order of ``counts``
        (first-seen order for a ``Counter``).
    """
    return [
        key + ' ' + str(total) + '\n'
        for key, total in counts.items()
        if key not in exclude
    ]


def main(src='./file.txt', dst='./file_save.txt'):
    """Read ``src`` as UTF-8, count its non-ASCII characters, write ``dst``.

    Args:
        src: path of the text file to analyse.
        dst: path of the report file; it is created or overwritten.
    """
    # Context managers guarantee both files are closed even on error.
    with open(src, encoding='utf-8') as fid:
        character = count_characters(fid)
    with open(dst, 'w', encoding='utf-8') as fidw:
        fidw.writelines(format_counts(character))


if __name__ == '__main__':
    main()
# python 简单文本汉字提取
# 最新推荐文章于 2024-08-08 16:43:27 发布