In [3]:
# Remove special symbols.
# Translation table mapping each punctuation character to a single space.
# The backslash is escaped explicitly: a bare "\:" is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
_PUNCTUATION = ',./<>?;\\:"|[]{}`~!@#$%^&*()_+=-"'
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION, " "))


def replacePunctuations(line):
    """Return *line* with every listed punctuation character replaced by a space.

    Each punctuation character becomes exactly one space, so the length of
    the string is preserved. Characters outside the set (including the
    apostrophe) are left untouched.

    Args:
        line: The text to clean.

    Returns:
        A new string; *line* itself is not modified.
    """
    # One pass over the string instead of one full .replace() scan per
    # punctuation character encountered.
    return line.translate(_PUNCTUATION_TABLE)
# Count the words on one line.
def processLine(line, wordCounts):
    """Add the words found in *line* to the running tally *wordCounts*.

    Punctuation is first replaced by spaces via ``replacePunctuations``;
    the result is split on whitespace and each resulting word increments
    its entry in *wordCounts*.

    Args:
        line: One line of text.
        wordCounts: Dict mapping word -> occurrence count; updated in place.

    Returns:
        None. The result is delivered by mutating *wordCounts*.
    """
    for word in replacePunctuations(line).split():
        # .get() with a default of 0 covers first-seen words without a
        # separate membership test.
        wordCounts[word] = wordCounts.get(word, 0) + 1
def main(path=r"C:\Users\Administrator\Desktop\test.txt", top_n=10):
    """Print the most frequent words of a text file, one per line.

    Every line of the file is lower-cased and tallied with ``processLine``.
    The words are then printed as ``word<TAB>count``, highest count first;
    words with equal counts are printed in descending alphabetical order.

    Args:
        path: File to read. Defaults to the originally hard-coded location.
        top_n: Maximum number of words to print (non-negative). Defaults
            to 10.

    Returns:
        None. Output goes to standard output.
    """
    wordCounts = {}
    # The context manager closes the file even if processing raises.
    with open(path, "r") as infile:
        for line in infile:
            processLine(line.lower(), wordCounts)

    # Sort (count, word) pairs in descending order so the most frequent
    # words come first.
    ranked = sorted(
        ((count, word) for word, count in wordCounts.items()),
        reverse=True,
    )

    # Print the top_n most frequent words. Slicing simply yields fewer rows
    # when the file has fewer than top_n distinct words, where indexing from
    # the end would wrap around and eventually raise IndexError.
    for count, word in ranked[:top_n]:
        print(word + "\t" + str(count))
# Run the word count only when executed directly (script or notebook cell),
# not as a side effect of importing this module.
if __name__ == "__main__":
    main()
the 189 of 94 and 65 0 58 accidents 55 in 49 to 41 is 31 accident 28 can 23