#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
reload(sys)
sys.setdefaultencoding("utf8")
# Matches every character outside the CJK range U+4E00..U+9FA5.
# Compiled once at import time instead of on every translate() call.
_NON_CHINESE_RE = re.compile(u'[^\u4e00-\u9fa5]')


def translate(str):
    """Return only the Chinese characters of ``str``, concatenated in order.

    Args:
        str: A UTF-8 encoded byte string or an already-decoded text string.
            (The parameter name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        A text (unicode) string containing just the characters in the range
        U+4E00..U+9FA5; every other character, including whitespace, digits
        and punctuation, is dropped. Empty when nothing matches.
    """
    text = str
    if isinstance(text, bytes):
        # Decode explicitly so the result does not depend on the process-wide
        # default encoding; undecodable bytes are dropped, as before.
        text = text.decode('utf-8', 'ignore')
    # Deleting every non-Chinese character is equivalent to the former
    # split / join / split / join sequence.
    return _NON_CHINESE_RE.sub(u'', text)
def txtToDic(f):
    """Count how often each character occurs in the first line of a file.

    Only the first line (up to and including its newline, if any) is read;
    the rest of the file is ignored.

    Args:
        f: Path of a UTF-8 encoded text file.

    Returns:
        A dict mapping each character of the first line to its number of
        occurrences. Empty if the file is empty.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If the first line is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(f, 'rb') as handle:
        first_line = handle.readline().decode('utf-8')

    # Single pass over the text instead of one full count() scan per character.
    counts = {}
    for ch in first_line:
        counts[ch] = counts.get(ch, 0) + 1
    return counts
def sortDic(d):
    """Return the (key, value) pairs of ``d`` as a list, smallest value first.

    Pairs with equal values keep the order in which the dict yields them.
    """
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs
def main(src='test.txt', dst='get.txt'):
    """Write a per-character frequency table of the Chinese text in ``src``.

    The work is done in two passes, both through ``dst``:
      1. Every line of ``src`` is reduced to its Chinese characters and the
         results are written to ``dst`` back to back, with no separators.
      2. ``dst`` is read back, its characters are counted and ordered by
         ascending count, and ``dst`` is overwritten with one
         ``character:count`` entry per line.

    Args:
        src: Path of the UTF-8 input text file.
        dst: Path of the output file; also used for the intermediate text.
    """
    import io  # local import: explicit UTF-8 text output

    # Pass 1: keep only the Chinese characters of each input line.
    with open(src, 'rb') as reader, \
            io.open(dst, 'w', encoding='utf-8') as writer:
        for raw_line in reader:
            chinese = translate(raw_line)
            # Blank or non-Chinese lines contribute nothing.
            if chinese:
                writer.write(chinese)

    # Pass 2: replace the intermediate text with the sorted frequency table.
    ranked = sortDic(txtToDic(dst))
    with io.open(dst, 'w', encoding='utf-8') as writer:
        for char, count in ranked:
            writer.write(u"%s:%d\n" % (char, count))


if __name__ == '__main__':
    main()