# encoding: utf-8
import sys
import os
import jieba
import importlib
# NOTE(review): `importlib.reload(sys)` removed — it is a Python 2 idiom used
# before calling the (long-gone) `sys.setdefaultencoding`, and has no useful
# effect in Python 3; encodings are handled per-file via `open(encoding=...)`.
def savefile(savepath, content):
    """Write *content* to *savepath* encoded as GB2312.

    Characters not representable in GB2312 are silently dropped
    (``errors='ignore'``), matching the original behaviour.

    :param savepath: destination file path (created/overwritten)
    :param content: text to write
    """
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open/close pair leaked the handle on error.
    with open(savepath, "w", encoding='gb2312', errors='ignore') as fp:
        fp.write(content)
def readfile(path):
    """Read and return the full text of *path*, decoded as GB2312.

    Undecodable byte sequences are silently skipped
    (``errors='ignore'``), matching the original behaviour.

    :param path: file to read
    :return: decoded file content as ``str``
    """
    # 'with' guarantees the handle is closed even if read() raises;
    # the original open/close pair leaked the handle on error.
    with open(path, "r", encoding='gb2312', errors='ignore') as fp:
        return fp.read()
# --- Segment every document in the corpus tree with jieba -------------------
# Layout: <corpus_path>/<category>/<document>; output mirrors the tree under
# seg_path with each document replaced by its space-joined jieba tokens.
corpus_path = "F://研究生项目//文本分类语料库//"   # raw corpus root
seg_path = "F://研究生项目//分词后的文本语料库//"  # output root for segmented text

catelist = os.listdir(corpus_path)  # one entry per category directory
for mydir in catelist:
    # os.path.join avoids the doubled separators the original "+"-built
    # paths produced (class_path already ended with "/").
    class_path = os.path.join(corpus_path, mydir)
    # Stray files at the corpus root are not categories: listdir() on them
    # would raise, so skip anything that is not a directory.
    if not os.path.isdir(class_path):
        continue
    seg_dir = os.path.join(seg_path, mydir)
    if not os.path.exists(seg_dir):
        os.makedirs(seg_dir)
    for file_path in os.listdir(class_path):
        fullname = os.path.join(class_path, file_path)
        content = readfile(fullname).strip()
        print(content)  # debug trace of the raw document, kept from original
        # Collapse Windows line endings so jieba sees one continuous text.
        content = content.replace("\r\n", "").strip()
        content_seg = jieba.cut(content)  # generator of segmented tokens
        savefile(os.path.join(seg_dir, file_path), " ".join(content_seg))
print("中文语料分词结束")  # "Chinese corpus segmentation finished"