1. 读取文本
# Read the whole input file as UTF-8 and lowercase it so that counting is
# case-insensitive. The `with` block closes the file even if read() raises.
with open("这里修改为你要读取文件的地址", "r", encoding='UTF-8') as f:
    txt = f.read().lower()
2. 划分单词
# Tokenise on spaces, commas, periods and newlines. Adjacent separators
# produce empty strings, which are removed later with the other stop words.
separator_pattern = '[ ,.\n]'
array = re.split(separator_pattern, txt)
3. 词频统计
# Build the frequency table: each key is a token, each value is the number
# of times that token occurs in `array`.
dic = {}
for word in array:
    # get() supplies 0 the first time a token is seen.
    dic[word] = dic.get(word, 0) + 1
4. 去除无价值的词（停用词）
# Remove tokens that should not be ranked: the empty string produced by
# adjacent separators, common English function words, fragments of HTML
# <br /> tags left by the tokeniser, and a few domain words
# ('hair', 'dryer', 'dry' -- presumably the corpus is hair-dryer reviews).
# pop() with a default is used instead of `del` so that a word which never
# appears in the text does not raise KeyError.
stop_words = (
    '', 'the', 'i', 'and', 'it', 'are', 'a', 'to', 'is',
    'my', 'this', 'for', 'of', 'that', 'in', 'have', 'at',
    'was', 'with', 'one', 'on', 'not', 'so', 'hair', 'dryer',
    'you', "it's", 'had', 'has', 'be', 'dry', '<br', '/><br',
    'we',
)
for stop_word in stop_words:
    dic.pop(stop_word, None)
5. 输出出现频率最高的100个单词
# Print a blank separator, then the 100 most frequent (word, count) pairs
# as returned by order_dict1.
print('\n')
top_words = order_dict1(dic, 100)
print(top_words)
6. 完整代码
import os
import re
def order_dict(dicts, n):
    """Return the keys whose values are among the ``n`` largest distinct values.

    All keys sharing a qualifying value are included, so the result may hold
    more than ``n`` keys. Keys are ordered by value, largest first; keys with
    equal values are ordered by key, largest first.

    Args:
        dicts: Mapping from mutually comparable keys to hashable, mutually
            comparable values.
        n: Number of distinct top values to keep.

    Returns:
        A list of keys in the order described above.
    """
    # The n largest distinct values decide which entries qualify.
    top_values = set(sorted(set(dicts.values()), reverse=True)[:n])
    # One sort on (value, key) descending replaces rescanning every item
    # once per distinct value while producing the same ordering.
    ranked = sorted(
        (item for item in dicts.items() if item[1] in top_values),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    return [key for key, _ in ranked]
def order_dict1(dicts, n):
    """Return the ``n`` entries of ``dicts`` with the largest values.

    Args:
        dicts: Mapping whose values are mutually comparable.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(key, value)`` tuples ordered by value, largest first.
        Among equal values, entries inserted later come first. Fewer than
        ``n`` tuples are returned when the mapping is smaller, and an empty
        list when ``n`` is zero or negative.
    """
    # A negative n would turn the reverse slice below into an unrelated
    # range, so treat every non-positive request as "nothing".
    if n <= 0:
        return []
    ascending = sorted(dicts.items(), key=lambda item: item[1])
    # Walk backwards from the largest entry, taking at most n items.
    return ascending[-1:-(n + 1):-1]
if __name__ == "__main__":
    # Read the whole input file as UTF-8 and lowercase it so that counting
    # is case-insensitive. `with` closes the file even if read() raises.
    with open("这里修改为你要读取文件的地址", "r", encoding='UTF-8') as f:
        txt = f.read().lower()

    # Tokenise on spaces, commas, periods and newlines. Adjacent separators
    # produce empty strings, which are dropped with the stop words below.
    array = re.split('[ ,.\n]', txt)

    # Frequency table: token -> number of occurrences.
    dic = {}
    for word in array:
        dic[word] = dic.get(word, 0) + 1

    # Tokens that should not be ranked: the empty string, common English
    # function words, fragments of HTML <br /> tags left by the tokeniser,
    # and a few domain words ('hair', 'dryer', 'dry' -- presumably the
    # corpus is hair-dryer reviews).
    stop_words = (
        '', 'the', 'i', 'and', 'it', 'are', 'a', 'to', 'is',
        'my', 'this', 'for', 'of', 'that', 'in', 'have', 'at',
        'was', 'with', 'one', 'on', 'not', 'so', 'hair', 'dryer',
        'you', "it's", 'had', 'has', 'be', 'dry', '<br', '/><br',
        'we',
    )
    # pop() with a default, unlike `del`, does not raise KeyError when a
    # stop word never occurs in the text.
    for stop_word in stop_words:
        dic.pop(stop_word, None)

    # Blank separator, then the 100 most frequent (word, count) pairs.
    print('\n')
    print(order_dict1(dic, 100))
7. 运行结果