# 将根目录下,各个类别文件内所有TXT文件分词、去除停用词后集合为一个TXT文件
#-*- coding: UTF-8 -*-
import io
import os

import jieba
# Walk every category sub-folder of the given root directory and build one
# merged, segmented, stopword-free TXT file per category.
def eachFile(filepath, out_dir='D:\\Documents\\data\\Redeced1\\',
             stopwords_path='stopwords.txt'):
    """Segment every TXT file in each category folder under *filepath*.

    For each sub-folder of *filepath* (one folder per document category),
    every contained text file is cut into words with jieba, stopwords are
    dropped, and the space-separated words of ALL files are merged into a
    single ``<category>.txt`` written to *out_dir*.

    Parameters
    ----------
    filepath : str
        Root directory containing one sub-folder per category.
    out_dir : str
        Directory receiving the merged ``<category>.txt`` files; default
        kept for backward compatibility with the original script.
    stopwords_path : str
        UTF-8 stopword list, one entry per line (only the first token of
        each line is used).
    """
    # Load the stopword list once, into a set for O(1) membership tests
    # (the original probed a list inside the innermost per-word loop).
    stopwords = set()
    with io.open(stopwords_path, 'r', encoding='utf-8') as fr:
        for line in fr:
            tokens = line.strip().split()
            if tokens:  # skip blank lines instead of raising IndexError
                stopwords.add(tokens[0])

    for category in os.listdir(filepath):
        child = os.path.join(filepath, category)
        if not os.path.isdir(child):
            continue  # ignore stray files at the root level
        print(child)
        pieces = []  # collected "word " fragments; joined once at the end
        for fname in os.listdir(child):
            print(fname)
            # NOTE(review): corpus files are assumed GBK-encoded (the
            # original emitted GBK output) -- confirm against the data.
            with io.open(os.path.join(child, fname), 'r',
                         encoding='gbk', errors='ignore') as src:
                for raw_line in src:
                    words = jieba.cut(raw_line.strip('\n'))
                    pieces.extend(w + ' ' for w in words
                                  if w not in stopwords)
            # BUG FIX: the original `break` here stopped after the first
            # file of every category, contradicting the script's stated
            # purpose of merging ALL files; it has been removed.
        # `with` guarantees the output handle is closed even on error
        # (the original leaked it if an exception occurred mid-loop).
        with io.open(os.path.join(out_dir, category + '.txt'), 'w',
                     encoding='gbk', errors='ignore') as out:
            out.write(u''.join(pieces) + u'\n')
if __name__ == '__main__':
    # Root folder holding one sub-directory per document category.
    corpus_root = "D:\\Documents\\data\\Reduced\\"
    eachFile(corpus_root)