Python借助THULAC包对中文txt文档去停用词、分词`
import thulac
# 创建停用词list
def stopwordslist(filepath):
stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
return stopwords
# 对句子进行分词
def seg_sentence(sentence):
sentence_seged = jieba.cut(sentence.strip())
stopwords = stopwordslist('D:/stopwords.txt')
# 这里加载停用词的路径
outstr = ''
for word in sentence_seged:
if word not in stopwords:
if word != '\t':
outstr += word
outstr += " "
return outstr
thu1 = thulac.thulac(seg_only=True) #只进行分词,不进行词性标注
thu1.cut_f("input.txt", "output.txt") #对input.txt文件内容进行分词,输出到output.txt