import collections
def get_words(file, encoding=None):
    """Count how often each word occurs in a text file.

    Each line is lower-cased and split on whitespace.  Every non-alphabetic
    character (punctuation, digits, apostrophes, ...) is then dropped from
    each token, so ``"Hello,"`` is counted as ``"hello"`` and ``"don't"`` as
    ``"dont"``.  Tokens with no letters left (``"--"``, ``"123"``) are skipped.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding passed to ``open``.  ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        A ``collections.Counter`` mapping each cleaned word to the number of
        times it occurs in the file.
    """
    counts = collections.Counter()
    with open(file, encoding=encoding) as f:
        # Stream line by line so the whole file's tokens are never held in
        # memory at once; only the running counts are kept.
        for line in f:
            cleaned = (
                # Keeping only the letters leaves an all-letter token
                # unchanged, so no separate isalpha() fast path is needed.
                "".join(filter(str.isalpha, token))
                for token in line.lower().split()
            )
            # Skip tokens that had no letters at all.
            counts.update(word for word in cleaned if word)
    return counts
# Script driver: count the words in a local copy of emma.txt. The path is a
# hard-coded, Windows-specific absolute path, so it only works on that machine.
a = get_words(r'C:\Users\Administrator\Desktop\Article\emma.txt')
# Print the ten most frequent words as (word, count) pairs, highest count first.
print(a.most_common(10))
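# Tokenize first (at this point the last word of a sentence may still carry a trailing "," or "."), then check each token:
#   - no punctuation (i.e. the token consists only of letters) -> add it to the bag of words as is
#   - contains punctuation -> strip the non-letter characters, then add it to the bag of words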