设计思路:
1.找到目标文件路径,并读取文件
import jieba

# Slurp the whole target text into memory; the context manager closes
# the handle as soon as the read completes.
with open('data/data131368/test.txt', 'r', encoding = 'UTF-8') as fh:
    novel = fh.read()
2.处理停用词,防止对结果分析产生干扰
# Load the stop-word list, one word per line.
# BUG FIX: open()'s third *positional* parameter is `buffering`, so the
# original open(path, 'r', 'UTF-8') raised TypeError — encoding must be
# passed by keyword.  A `with` block also guarantees the handle is
# closed (the original leaked it).
with open('data/data131368/stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = [line.strip() for line in stopFile.readlines()]
novelList = list(jieba.lcut(novel))  # jieba.lcut: precise-mode segmentation into word tokens
其中停用词列表的核心写法如下:
```stopwords = [line.strip() for line in open(...).readlines()]```
#注意 readlines 后要加括号才是调用。line 本身已经是字符串,line.strip() 返回去掉首尾指定字符后的新字符串;括号中不传参数时,默认去除首尾的空白字符(包括换行符)
> str.strip([chars])
Return a copy of the string with the leading and trailing characters removed.
源代码:
import jieba

# Read the full text of the essay.
with open('data/data131368/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Load stop words into a set: O(1) membership tests instead of an O(n)
# list scan per token, and the `with` block closes the file handle
# (the original open() call was never closed).
with open('data/data131368/stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = {line.strip() for line in stopFile}

# Precise-mode segmentation of the text into word tokens.
novelList = list(jieba.lcut(novel))

# Count every token that is not a stop word and is longer than one
# character (single characters are mostly function words / noise).
noveldict = {}
for word in novelList:
    if word not in stopwords and len(word) > 1:
        noveldict[word] = noveldict.get(word, 0) + 1

# Sort (word, count) pairs by count, highest first.
noveldictSorted = sorted(noveldict.items(), key=lambda e: e[1], reverse=True)

# Print the twenty most frequent words.
for TopNumWord in noveldictSorted[0:20]:
    print(TopNumWord)
输出结果:
('父亲', 98)
('背影', 54)
('作者', 39)
('儿子', 28)
('铁道', 17)
('表现', 17)
('感情', 15)
('文章', 15)
('橘子', 14)
('散文', 11)
('茶房', 10)
('桔子', 10)
('月台', 10)
('朱自清', 10)
('父子', 10)
('白描', 10)
('一日', 9)
('语言', 9)
('描写', 9)
('感人', 9)
print(noveldict) 在计数开始前输出:{}
print(noveldict.items()) 输出:dict_items([(),(),()…])