Tokenization and stopword removal
#https://github.com/xgli/jieba
import os
import jieba
# Path to the raw (unsegmented) corpus
corpus_path = r' '
# Path where the segmented corpus will be written
seg_path = r' '
# Path to the stopword list
stop_list_Path = r' '
def stopwordsList(stop_list_Path):
    # Load the stopword list: one stopword per line, stripped of whitespace
    with open(stop_list_Path, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords
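# Note (not in the original code): for large corpora, wrapping the result in a
# set makes the later stopword membership tests O(1) on average:
#   stopwords = set(stopwordsList(stop_list_Path))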
def readfile(filepath):
    # Return the whole document as one string so it can be segmented later.
    # read() returns the full text as a single string; readline() returns one
    # line as a string; readlines() reads all lines into a list.
    with open(filepath, 'r', encoding='gb2312', errors='ignore') as f:
        content = f.read()
    return content
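# For illustration (not in the original code): given a file whose content is
# "a\nb\n", the three read methods behave as follows:
#   f.read()      -> 'a\nb\n'          (entire file as one string)
#   f.readline()  -> 'a\n'             (the next single line)
#   f.readlines() -> ['a\n', 'b\n']    (all lines as a list of strings)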
def savefile(seg_path, content):
    with open(seg_path, 'w', encoding='utf-8') as f:
        f.write(content)
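# A minimal sketch (not part of the original code) of the per-document step the
# function below presumably performs: segment the raw text with jieba and drop
# stopwords before saving. Variable names here are illustrative only.
#   content = readfile(filepath)
#   words = jieba.lcut(content)          # segment the text into a list of words
#   kept = [w for w in words if w not in stopwords and w.strip()]
#   savefile(out_path, ' '.join(kept))   # write space-separated tokens as UTF-8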
def tikenizer_and_removeStoplist(corpus_path, stop_list_Path):
    cate_dir = os.listdir(corpus_path)  # list the per-category subdirectories
    for cate in cate_dir: