First, the data has to be preprocessed: segment the text with jieba, filter out stop words, and save the result to a file. This example uses 天龙八部.txt.
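Before running the full script, a one-liner shows what jieba segmentation produces (the sample sentence and its expected output come from jieba's own documentation):

import jieba

# jieba.lcut returns the segmentation as a list of strings.
print(jieba.lcut("我来到北京清华大学"))
# ['我', '来到', '北京', '清华大学']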
import jieba
import jieba.analyse
import jieba.posseg as pseg

# Load the stop-word list (one word per line); a set gives O(1) lookups.
stop_words = set()
with open('data/stopwords.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        line = line.replace("\n", "").replace("\r", "").strip()
        if line:
            stop_words.add(line)
print(stop_words)

cut_result = open("data/after_cut.txt", 'w', encoding='UTF-8')

def cut_words(filepath):
    with open(filepath, 'r', encoding='UTF-8') as f:
        for line in f:
            word_list = []
            line = line.replace("\n", "").replace("\r", "").strip()
            words = jieba.lcut(line)
            for word in words:
                if word not in stop_words:
                    word_list.append(word)
            # Write one space-separated line per input line.
            # (The original snippet is truncated here; this completion
            # follows the stated goal of saving the result to a file.)
            cut_result.write(" ".join(word_list) + "\n")

cut_words('data/天龙八部.txt')  # path assumed to match the data/ layout above
cut_result.close()
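Once cut_words has run, every line of data/after_cut.txt holds one space-separated, stop-word-filtered line of the novel. A quick sanity check (a minimal sketch; it assumes the script above has already produced the file):

# Print the first three segmented lines to verify the output format.
with open('data/after_cut.txt', 'r', encoding='UTF-8') as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i >= 2:
            break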