考完试了,总结一下。因为时间有些久,我记不清都参考过哪些资料了。
分词
分词直接用的 jieba。老师的要求是保留名词,所以我只保留了名词(PS:其他词性也是可以保留的)。
import os
import jieba.posseg as pseg
import jieba
import codecs
import myIO
def load_stopwords(path='C:/lyr/DM/stop_words_ch.txt'):
    """Read the stop-word list, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the path that was
            previously hard-coded, so existing no-argument calls keep
            working unchanged.

    Returns:
        A list containing one string per line of the file, in file order,
        with leading and trailing whitespace stripped. Blank lines are kept
        as empty strings.
    """
    # NOTE(review): the file is opened with the platform default encoding,
    # as before; confirm the stop-word file matches it, or pass an explicit
    # encoding.
    # The with-block guarantees the handle is closed even if reading fails;
    # the previous version never closed it.
    with open(path) as f:
        return [line.strip() for line in f]
def cut_words(label, file_list, file_path, cut_dir):
print('Run task (%s)...' % (os.getpid()))
for j, file_name in enumerate(file_list):
fullpath = file_path + file_name
content = myIO.readfile(fullpath)
content = content.replace('\r\n'.encode('utf-8'),''.encode('utf-8')).strip()
content = content.replace(' '.encode('utf-8'),''.encode('utf-8')).strip()
content_seg = pseg.cut(content)
_write_noun(file_name, conten