展开全部
假设你的系统为 Windows,即中文环境编码为 gbk。看代码:
# -*- encoding: gbk -*-
def is_chinese(uchar):
    """Return True if ``uchar`` is a Chinese character.

    A character counts as Chinese when it compares within the inclusive
    range U+4E00..U+9FA5. The test is a plain string comparison against the
    two boundary characters, so it is meant to be called with a single
    unicode character.

    Args:
        uchar: A unicode string, normally exactly one character long.

    Returns:
        bool: True when ``uchar`` falls inside the range, otherwise False.
    """
    # A chained comparison yields the bool directly; no if/else needed.
    return u'\u4E00' <= uchar <= u'\u9FA5'
def count_chinese_word(filepath, encoding):
    """Count how often each Chinese character occurs in a text file.

    The file is decoded with ``encoding`` and scanned character by
    character; every character accepted by ``is_chinese`` is tallied.

    Args:
        filepath: Path of the text file to read.
        encoding: Name of the codec used to decode the file (e.g. 'gbk').

    Returns:
        dict: Maps each Chinese character found to its occurrence count.
        If the file cannot be opened or read, a message is printed and
        whatever was counted up to that point (possibly nothing) is
        returned.
    """
    # Local import keeps this block self-contained; io.open accepts an
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    _dict = {}
    try:
        # Decode once at the I/O boundary instead of calling .decode() on
        # every line, which fails on Python 3 where lines are already text.
        with io.open(filepath, 'r', encoding=encoding) as txt_file:
            for line in txt_file:
                for uchar in line:
                    if is_chinese(uchar):
                        # dict.get replaces the removed dict.has_key check.
                        _dict[uchar] = _dict.get(uchar, 0) + 1
    except IOError:
        # Same text the original print statement produced.
        print("文件 %s 不存在" % filepath)
    return _dict
if __name__ == '__main__':
    import json

    # Tally the Chinese characters of the sample file, decoding it as GBK.
    _dict = count_chinese_word('内容.txt', 'gbk')
    # ensure_ascii=False prints the characters themselves rather than
    # \uXXXX escapes. The former ``encoding`` keyword is dropped: it is
    # rejected by Python 3's json.dumps and was not needed for unicode keys.
    print(json.dumps(_dict, indent=4, ensure_ascii=False))
内容.txt:
运行: