# Configure logging so the processing progress is visible on the console.
import logging
import os
import re

import jieba  # third-party Chinese word segmentation; required by seg_depart()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Path to the stop-words file (one stop word per line).
stopwords_path = "G:/1研究生/news_stopwords.txt"
"""创建停用词列表"""
def stopwordslist():
stopwords = [line.strip() for line in open(stopwords_path,encoding='UTF-8').readlines()]
return stopwords
# Segment a Chinese sentence and drop stop words.
def seg_depart(sentence, stopwords=None):
    """Tokenize *sentence* with jieba and remove stop words.

    Args:
        sentence: Raw Chinese text to segment.
        stopwords: Optional pre-loaded stop-word collection. When None
            (the default, preserving the original call shape) the list is
            loaded from disk via stopwordslist(). Passing it in avoids
            re-reading the stop-words file for every sentence — the
            original reloaded the file on each call.

    Returns:
        str: kept tokens each followed by a single space (matching the
        original output format), or "" when nothing survives filtering.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # A set makes per-token membership tests O(1) instead of O(n).
    stopwords = set(stopwords)
    # Keep tokens longer than one character that are not stop words.
    kept = (
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and len(word) > 1
    )
    # join() instead of repeated '+=' — avoids quadratic string building.
    return "".join(word + " " for word in kept)
"""如果文档还没分词,就进行分词"""
count=0
if not os.path.exists(outfilename):
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
# 把非汉字的字符全部去掉
# 将输出结果写入ouputs.txt中
for line in inputs:
line = line.split('\t')[1]
line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
line_seg = seg_depart(line.strip())
outputs.write(line_seg.strip() + '\n')
count+=1
if(count%200==0):
print(count)
outputs.close()
inputs.close()
print("删除停用词和分词成功!!!")
# --- Trailing blog-page residue, commented out so the file stays valid Python ---
# 代码!以备不时之需!中文文本预处理(停用词、空格分隔、按行分类)
# (Code! Saved for future use! Chinese text preprocessing: stop-word removal,
#  space-separated tokens, one record per line.)
# 最新推荐文章于 2024-03-22 12:59:16 发布