读取文件,并用字典的方式来输出每个字的字频
def count_characters(text):
    """Return a dict mapping each character of *text* to its occurrence count.

    Keys are inserted in the order the characters are first seen, so the
    resulting dict iterates in first-appearance order.
    """
    count = {}
    for character in text:
        # dict.get supplies 0 the first time a character is encountered.
        count[character] = count.get(character, 0) + 1
    return count


if __name__ == "__main__":
    # The context manager closes the file handle as soon as it has been read.
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass encoding=... explicitly once the file's real encoding is confirmed.
    with open(r"C:\Users\gulu\Desktop\python\金庸小说 txt\碧血剑.txt", "r") as novel_file:
        mylist = novel_file.read()
    count = count_characters(mylist)
    print(count)
利用 pandas 进行数据处理,并输出出现次数最多的前 10 个字
import pandas as pd

# Punctuation and whitespace characters left out of the frequency ranking.
PUNCTUATION = [',', "。", '”', '“', ':', '?', '\n', ' ']


def top_characters(count, n=10):
    """Return the *n* most frequent non-punctuation characters.

    Args:
        count: mapping of character -> occurrence count.
        n: how many of the top-ranked characters to keep (default 10).

    Returns:
        A pandas Series indexed by character, sorted by count in
        descending order, with at most *n* entries.
    """
    # Turn the mapping into a one-dimensional Series (index = character).
    count1 = pd.Series(count)
    # ascending=False sorts in descending order: most frequent first.
    aa = count1.sort_values(ascending=False)
    # Text cleaning: remove punctuation. errors="ignore" keeps this from
    # raising KeyError when a listed mark never occurs in the text.
    bb = aa.drop(PUNCTUATION, errors="ignore")
    # Positional slice from 0 so the single most frequent character is kept.
    return bb.iloc[:n]


if __name__ == "__main__":
    # `count` is the character-frequency dict built by the previous snippet.
    print(top_characters(count))
完整代码
import pandas as pd
# Punctuation and whitespace characters left out of the frequency ranking.
PUNCTUATION = [',', "。", '”', '“', ':', '?', '\n', ' ']


def count_characters(text):
    """Return a dict mapping each character of *text* to its occurrence count.

    Keys are inserted in the order the characters are first seen.
    """
    count = {}
    for character in text:
        # dict.get supplies 0 the first time a character is encountered.
        count[character] = count.get(character, 0) + 1
    return count


def top_characters(count, n=10):
    """Return the *n* most frequent non-punctuation characters.

    Args:
        count: mapping of character -> occurrence count.
        n: how many of the top-ranked characters to keep (default 10).

    Returns:
        A pandas Series indexed by character, sorted by count in
        descending order, with at most *n* entries.
    """
    # Turn the mapping into a one-dimensional Series (index = character).
    count1 = pd.Series(count)
    # ascending=False sorts in descending order: most frequent first.
    aa = count1.sort_values(ascending=False)
    # Text cleaning: remove punctuation. errors="ignore" keeps this from
    # raising KeyError when a listed mark never occurs in the text.
    bb = aa.drop(PUNCTUATION, errors="ignore")
    # Positional slice from 0 so the single most frequent character is kept.
    return bb.iloc[:n]


if __name__ == "__main__":
    # The context manager closes the file handle as soon as it has been read.
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass encoding=... explicitly once the file's real encoding is confirmed.
    with open(r"C:\Users\gulu\Desktop\python\金庸小说 txt\碧血剑.txt", "r") as novel_file:
        mylist = novel_file.read()
    count = count_characters(mylist)
    print(top_characters(count))