# encoding=utf-8
# Clean the raw stop-word list: strip whitespace and drop empty lines.
filename = "stop_words.txt"

result = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        result.append(line)

# Write the cleaned list back out, one stop word per line.
with open("stop_words2.txt", "w", encoding="utf-8") as fw:
    for sentence in result:
        data = sentence.strip()
        if data:
            fw.write(data)
            fw.write("\n")
print("end")
Word segmentation and stop-word filtering (punctuation included)
# encoding=utf-8
import jieba

filename = "../data/1000页洗好2.txt"
stopwords_file = "../data/stop_words2.txt"

# Load the cleaned stop-word list.
stop_words = []
with open(stopwords_file, "r", encoding="utf-8") as stop_f:
    for line in stop_f:
        line = line.strip()
        if not line:
            continue
        stop_words.append(line)
print(len(stop_words))

# Segment each line with jieba (precise mode) and drop stop words;
# since the list includes punctuation, this removes it as well.
result = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        outstr = ''
        seg_list = jieba.cut(line, cut_all=False)
        for word in seg_list:
            if word not in stop_words:
                if word != '\t':
                    outstr += word
                    outstr += " "
        result.append(outstr.strip())
with open("../data/test2.txt","w",encoding='utf-8') as fw:
for sentence in result:
sentence.encode('utf-8'
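One efficiency note: `word not in stop_words` rescans the whole list for every token, which is O(n) per lookup. Converting the list to a set makes each membership test constant time. A minimal sketch under that assumption (`segment_line` is a hypothetical helper name, not part of the original script):

# Sketch: set-based stop-word lookup; assumes stop_words and filename
# are the variables defined in the script above.
import jieba

def segment_line(line, stop_set):
    # Precise-mode segmentation, keeping only non-stop, non-tab tokens.
    words = jieba.cut(line, cut_all=False)
    return " ".join(w for w in words if w not in stop_set and w != '\t')

stop_set = set(stop_words)  # O(1) lookups instead of O(n)
with open(filename, "r", encoding="utf-8") as f:
    result = [segment_line(line.strip(), stop_set) for line in f if line.strip()]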