# train.py
import os
import shutil
import jieba
import pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import savefile, readfile, readbunchobj, writebunchobj
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.externals import joblib
def corpus_segment(corpus_path, seg_path):
    """Segment every raw corpus file with jieba and mirror the category tree.

    Parameters
    ----------
    corpus_path : str
        Directory containing one sub-directory per category, each holding
        raw text files (read as bytes by ``readfile``).
    seg_path : str
        Output directory; it is wiped and rebuilt on every run, and each
        segmented file is written with words separated by single spaces.
    """
    catelist = os.listdir(corpus_path)
    print("分词中...")
    # Start from a clean output tree so stale segmented files never linger.
    # (This also makes the per-category "already exists" handling below
    # unnecessary: no seg_dir can exist after the wipe.)
    if os.path.exists(seg_path):
        shutil.rmtree(seg_path)
    for mydir in catelist:
        class_path = os.path.join(corpus_path, mydir)
        seg_dir = os.path.join(seg_path, mydir)
        os.makedirs(seg_dir)
        for file_name in os.listdir(class_path):
            fullname = os.path.join(class_path, file_name)
            content = readfile(fullname)
            # Strip line breaks and spaces so jieba sees one continuous text.
            # (The original repeated the '\r\n' replace three times; once is
            # enough — bytes.replace substitutes every occurrence.)
            content = content.replace(b'\r\n', b'')
            content = content.replace(b' ', b'').strip()
            content_seg = jieba.cut(content)
            savefile(os.path.join(seg_dir, file_name),
                     ' '.join(content_seg).encode('utf-8'))
    print("中文语料分词结束!!!")
def corpus2Bunch(wordbag_path, seg_path):
    """Collect all segmented files under *seg_path* into one Bunch and
    pickle it to *wordbag_path*.

    The Bunch carries:
      - target_name: list of category (sub-directory) names
      - label:       per-file category label, parallel to filenames
      - filenames:   full path of each segmented file
      - contents:    raw bytes of each segmented file
    """
    catelist = os.listdir(seg_path)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    for mydir in catelist:
        class_path = os.path.join(seg_path, mydir)
        for file_name in os.listdir(class_path):
            fullname = os.path.join(class_path, file_name)
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))
    # Persist via the shared Tools helper (same as vector_space does)
    # instead of an ad-hoc inline pickle.dump.
    writebunchobj(wordbag_path, bunch)
    print("构建文本对象结束!!!")
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    """Turn a pickled Bunch of segmented texts into a TF-IDF space Bunch.

    Parameters
    ----------
    stopword_path : str
        File of stop words, one per line (read as bytes).
    bunch_path : str
        Path of the pickled input Bunch (from corpus2Bunch).
    space_path : str
        Where the resulting TF-IDF Bunch is written.
    train_tfidf_path : str, optional
        When given (test mode), the training-space Bunch whose vocabulary
        is reused so feature columns line up with the trained model.
    """
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})
    if train_tfidf_path is None:
        # Training mode: learn the vocabulary from this corpus.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    else:
        # Test mode: lock the vectorizer to the training vocabulary.
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    writebunchobj(space_path, tfidfspace)
    print("tf-idf词向量空间实例创建成功!!!")
if __name__ == "__main__":
    # Training pipeline paths. All are relative to the working directory.
    train_corpus_path = "train_corpus/"
    stopword_path = "stopword.txt"
    train_seg_path = "train_corpus_seg/"
    train_bunch_path = "train_word_bag/train_set.dat"
    train_space_path = "train_word_bag/tfdifspace.dat"
    # The model file doubles as a "training already done" marker.
    if not os.path.exists('clf_model_chinese.m'):
        # 1) segment the raw corpus with jieba
        corpus_segment(train_corpus_path, train_seg_path)
        # 2) bundle the segmented files into a pickled Bunch
        if not os.path.exists('train_word_bag/'):
            os.makedirs('train_word_bag/')
        corpus2Bunch(train_bunch_path, train_seg_path)
        # 3) build the TF-IDF vector space for the training set
        vector_space(stopword_path, train_bunch_path, train_space_path)
        # 4) train and persist a multinomial Naive Bayes classifier.
        # alpha is the additive-smoothing strength; this small value keeps
        # the estimates sharp while still avoiding zero probabilities.
        train_set = readbunchobj(train_space_path)
        clf = MultinomialNB(alpha=0.0001).fit(train_set.tdm, train_set.label)
        joblib.dump(clf, "clf_model_chinese.m")
    else:
        print('已完成过训练')
# Tools.py
import pickle
import os
import re
import shutil
import jieba
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
def savefile(savepath, content):
    """Write *content* (bytes) to *savepath*, overwriting any existing file."""
    with open(savepath, "wb") as out:
        out.write(content)
def readfile(path):
    """Return the raw bytes of the file at *path*."""
    with open(path, "rb") as fp:
        return fp.read()
def writebunchobj(path, bunchobj):
    """Pickle *bunchobj* to the binary file at *path*."""
    with open(path, "wb") as fh:
        pickle.dump(bunchobj, fh)
def readbunchobj(path):
    """Unpickle and return the object stored at *path*.

    NOTE(review): unpickling untrusted files can execute arbitrary code;
    only use on files this project wrote itself.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)
# classification.py
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj
from sklearn.externals import joblib
def Model(modelpath):
    """Load and return the persisted classifier stored at *modelpath*."""
    return joblib.load(modelpath)
def ClassificationCN(text, stoppath="stopword.txt", modelpath='clf_model_chinese.m',
                     vocabularypath="train_word_bag/tfdifspace.dat"):
    """Classify one Chinese text string with the persisted model.

    Parameters
    ----------
    text : str
        Raw (unsegmented) Chinese text.
    stoppath : str
        Stop-word file, one word per line.
    modelpath : str
        Path of the joblib-persisted classifier.
    vocabularypath : str
        Path of the training TF-IDF Bunch whose vocabulary fixes the
        feature columns.

    Returns
    -------
    The predicted label (first element of ``clf.predict``).
    """
    clf = Model(modelpath)
    # Normalize: drop line breaks and spaces, then segment with jieba.
    raw = text.encode('utf-8')
    raw = raw.replace(b'\r\n', b'').replace(b' ', b'').strip()
    # Join at C speed instead of the original quadratic `text += " " + ge`
    # loop; the vectorizer's tokenizer is insensitive to the exact spacing.
    seg_text = " ".join(jieba.cut(raw))
    trainbunch = readbunchobj(vocabularypath)
    stpwrdlst = readfile(stoppath).splitlines()
    # max_df is inert when an explicit vocabulary is supplied; kept for
    # parity with the original call.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5, vocabulary=trainbunch.vocabulary)
    tfidf = vectorizer.fit_transform([seg_text])
    return clf.predict(tfidf)[0]