My current project needs to process some data with jieba word segmentation, removing stopwords along the way. The flow is: walk the data directory, load a stopword list, cut each line with jieba, drop the stopword tokens, and write the '/'-joined result to one output file per input file. Here is the code:
import jieba
import os
def fun(filepath):  # walk all files under the folder and return them as a list of paths
    arr = []
    for root, dirs, files in os.walk(filepath):
        for fn in files:
            arr.append(os.path.join(root, fn))  # join with the right separator instead of hard-coding "\\"
    return arr
# Build the stopword list
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:  # close the file when done
        stopwords = [line.strip() for line in f]
    return stopwords
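For reference, stop_words.txt is assumed here to hold one stopword per line (that is all `line.strip()` per line implies); the entries below are just examples:

的
了
是
、
。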
# Remove stopwords from a segmented sentence (expects an iterable of tokens)
def movestopwords(sentence):
    stopwords = stopwordslist('D:/2181729/stop_words.txt')  # path to the stopword file
    santi_words = [x for x in sentence if len(x) > 1 and x not in stopwords]  # also drops single-character tokens
    return santi_words
def segmentor(text):
    words = jieba.cut(text, cut_all=False)  # precise mode; returns a generator
    return words
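As a quick smoke test of the helpers above (the sample sentence is made up, and movestopwords still reads the stop_words.txt path hard-coded inside it, so that file must exist):

words = list(segmentor('今天天气不错'))
print(words)                 # raw jieba tokens
print(movestopwords(words))  # tokens minus stopwords and single characters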
#stopwords = {}.fromkeys(['的', '包括', '等', '是', '《', '》', '(', ')', '.', '、', '。'])
stopwords = stopwordslist('D:/2181729/stop_words.txt')
filepath = r'D:/2181729/data'
filelist = fun(filepath)  # get the list of data files
count = 0
print(len(filelist))
#f1 = open('D:/2181729/nerfcdata/1.txt', 'a+')
for file in filelist:
    text = ""  # reset per file; otherwise every output would accumulate all previous files
    with open(file, encoding='UTF-8') as f:
        for line in f:
            segs = jieba.cut(line, cut_all=False)
            for seg in segs:
                if seg not in stopwords and seg.strip():  # drop stopwords and whitespace tokens
                    text += seg
    words = segmentor(text)  # segment the filtered text again
    #print('/'.join(words))
    count += 1
    output = '/'.join(words)
    outpath = 'D:/2181729/nerfcdata/' + os.path.basename(file)  # keep the original file name; 'dir' would shadow the built-in
    with open(outpath, 'w', encoding='UTF-8') as f1:
        print(output)
        f1.write(output)
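If you just want to verify the cut-and-filter step in isolation, here is a minimal sketch with no file dependencies; the sample sentence and the tiny inline stopword set are made up, standing in for stop_words.txt:

import jieba

stopwords = {'的', '了', '是', '、', '。'}  # hypothetical stand-in for stop_words.txt
sentence = '结巴分词是一个好用的中文分词工具。'
kept = [w for w in jieba.cut(sentence, cut_all=False) if w.strip() and w not in stopwords]
print('/'.join(kept))  # the surviving tokens joined by '/'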