def getText(path="sj.txt", encoding=None):
    """Read a text file and return it normalised for word counting.

    The text is lower-cased, every character listed in ``separators``
    (punctuation, digits and the space itself) is turned into a single
    space, and any character still left with a code point of 256 or above
    is removed outright.

    Args:
        path: File to read. Defaults to ``"sj.txt"`` relative to the
            current working directory.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        The normalised text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content does not match the encoding.
    """
    separators = ",.? ——--- ________ _____________________ ,+ ;【 】 $ % ‘“ '': ”。; / …… ( ) \" 1234567890"

    # The context manager closes the handle even if reading fails.
    with open(path, "r", encoding=encoding) as source:
        txt = source.read().lower()

    # One translate() pass instead of one full replace() scan per character.
    to_space = str.maketrans(separators, " " * len(separators))
    txt = txt.translate(to_space)

    # Characters outside the 0-255 range that were not listed as separators
    # are dropped (not spaced out), so the text around them is joined.
    return "".join(ch for ch in txt if ord(ch) < 256)
# Number of top-ranked words to print; set to None to print every word.
TOP_N = 10
# Number of entries printed on each output row.
PER_ROW = 5

sjTxt = getText()
words = sjTxt.split()

# Tally how often each whitespace-separated word occurs.
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Most frequent first; the sort is stable, so words with equal counts keep
# the order in which they were first seen.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slicing instead of indexing range(10) avoids an IndexError when the text
# contains fewer than TOP_N distinct words.
for i, (word, count) in enumerate(items[:TOP_N]):
    print("{0:<10}{1:<6}".format(word, count), end="\t")
    # Break the row after every PER_ROW entries.
    if (i + 1) % PER_ROW == 0:
        print("\n")
# Program output:
# the 369 a 276 to 254 of 164 and 161
# in 157 is 125 b 96 c 90 d 87