决策树（CART算法）针对中文文本分类

最新推荐文章于 2022-10-09 17:49:32 发布

微电子学与固体电子学-俞驰

最新推荐文章于 2022-10-09 17:49:32 发布

阅读量4.4k

点赞数 1

分类专栏： Python自然语言处理

Python自然语言处理专栏收录该内容

60 篇文章 0 订阅

订阅专栏

改编自博客：

http://blog.csdn.net/github_36326955/article/details/54891204

根据下面的参考链接可知，sklearn中的决策树用的是CART算法

http://sofasofa.io/forum_main_post.php?postid=1000402&

做个笔记

代码按照1 2 3 4的顺序进行即可：

1.py(corpus_segment.py)

[python]view plain copy
 #!/usr/bin/env python  
 # -*- coding: UTF-8 -*-  
 """ 
 @version: python2.7.8  
 @author: XiangguoSun 
 @contact: sunxiangguodut@qq.com 
 @file: corpus_segment.py 
 @time: 2017/2/5 15:28 
 @software: PyCharm 
 """  
 import sys  
 import os  
 import jieba  
 # Switch the interpreter-wide default string encoding to UTF-8 so that
 # implicit str/unicode conversions of Chinese text do not fail.
 # Python 2 only: sys is reloaded first because setdefaultencoding is not
 # available on the sys module after normal interpreter start-up.
 reload(sys)  
 sys.setdefaultencoding('utf-8')  
 # 保存至文件  
 def savefile(savepath, content):
     """Write ``content`` to ``savepath``, replacing any existing file.

     Args:
         savepath: Path of the file to create or overwrite.
         content: Data to store.  Byte strings are written unchanged; text
             (``unicode`` on Python 2) is encoded as UTF-8 first.
     """
     # The file is opened in binary mode, so text has to be encoded
     # explicitly.  Doing it here keeps the output UTF-8 even when the
     # interpreter-wide default encoding has not been changed.
     if isinstance(content, type(u"")):
         content = content.encode("utf-8")
     with open(savepath, "wb") as fp:
         fp.write(content)
 # 读取文件  
 def readfile(path):
     """Return the complete raw contents of the file at ``path``.

     The file is opened in binary mode, so the bytes are returned exactly
     as stored (no decoding, no newline translation).
     """
     with open(path, "rb") as source:
         return source.read()
   
 def corpus_segment(corpus_path, seg_path):
     """Segment every document of a category-organised corpus with jieba.

     The corpus is expected to be laid out as ``corpus_path/<category>/<file>``
     (for example ``train_corpus/art/21.txt``).  Each document is read,
     stripped of line breaks and spaces, tokenised with ``jieba.cut`` and
     written to ``seg_path/<category>/<file>`` with the tokens joined by
     single spaces.

     Args:
         corpus_path: Directory holding one sub-directory per category with
             the unsegmented documents.
         seg_path: Directory that receives the segmented documents; the
             category sub-directories are created when missing.
     """
     for category in os.listdir(corpus_path):
         # os.path.join makes the function work whether or not the caller
         # supplied a trailing path separator.
         class_path = os.path.join(corpus_path, category)
         if not os.path.isdir(class_path):
             # A stray file next to the category folders is not a category;
             # listing it as a directory would raise OSError.
             continue

         seg_dir = os.path.join(seg_path, category)
         if not os.path.exists(seg_dir):
             os.makedirs(seg_dir)

         for file_name in os.listdir(class_path):
             content = readfile(os.path.join(class_path, file_name))
             # Collapse the document into one compact line.  Both CRLF and
             # bare LF/CR line breaks are removed, not only "\r\n".
             for unwanted in ("\r", "\n", " "):
                 content = content.replace(unwanted, "")
             content_seg = jieba.cut(content)
             savefile(os.path.join(seg_dir, file_name), " ".join(content_seg))

     print("中文语料分词结束！！！")
   
 ''''' 
 如果你对if __name__=="__main__":这句不懂，可以参考下面的文章 
 http://imoyao.lofter.com/post/3492bc_bd0c4ce 
 简单来说如果其他python文件调用这个文件的函数，或者把这个文件作为模块 
 导入到你的工程中时，那么下面的代码将不会被执行，而如果单独在命令行中 
 运行这个文件，或者在IDE（如pycharm）中运行这个文件时候，下面的代码才会运行。 
 即，这部分代码相当于一个功能测试。 
 如果你还没懂，建议你放弃IT这个行业。 
 '''  
 if __name__ == "__main__":
     # (unsegmented corpus directory, segmented output directory) pairs:
     # first the training set, then the test set.
     segmentation_jobs = (
         (
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/",
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/",
         ),
         (
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/",
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/",
         ),
     )
     for raw_dir, segmented_dir in segmentation_jobs:
         corpus_segment(raw_dir, segmented_dir)

2.py(corpus2Bunch.py)

[python]view plain copy
 #!/usr/bin/env python  
 # -*- coding: UTF-8 -*-  
 """ 
 @version: python2.7.8  
 @author: XiangguoSun 
 @contact: sunxiangguodut@qq.com 
 @file: corpus2Bunch.py 
 @time: 2017/2/7 7:41 
 @software: PyCharm 
 """  
 import sys  
 # Python 2 only: reload sys to get access to setdefaultencoding, then make
 # UTF-8 the interpreter-wide default for implicit str/unicode conversions.
 reload(sys)  
 sys.setdefaultencoding('utf-8')  
 import os#python内置的包，用于进行文件目录操作，我们将会用到os.listdir函数  
 import cPickle as pickle#导入cPickle包并且取一个别名pickle  
 ''''' 
 事实上python中还有一个也叫作pickle的包，与这里的名字相同了，无所谓 
 关于cPickle与pickle，请参考博主另一篇博文： 
 python核心模块之pickle和cPickle讲解 
 http://blog.csdn.net/github_36326955/article/details/54882506 
 本文件代码下面会用到cPickle中的函数cPickle.dump 
 '''  
 from sklearn.datasets.base import Bunch  
 #这个您无需做过多了解，您只需要记住以后导入Bunch数据结构就像这样就可以了。  
 #今后的博文会对sklearn做更有针对性的讲解  
   
   
 def _readfile(path):  
     '''''读取文件'''  
     #函数名前面带一个_,是标识私有函数  
     # 仅仅用于标明而已，不起什么作用，  
     # 外面想调用还是可以调用，  
     # 只是增强了程序的可读性  
     with open(path, "rb") as fp:#with as句法前面的代码已经多次介绍过，今后不再注释  
         content = fp.read()  
     return content  
   
 def corpus2Bunch(wordbag_path, seg_path):
     """Collect a segmented corpus into a ``Bunch`` and pickle it.

     The corpus is expected to be laid out as ``seg_path/<category>/<file>``.
     The resulting Bunch has four parallel-by-design fields:

     * ``target_name`` - the category names (sub-directories of ``seg_path``),
     * ``label``       - the category of each document,
     * ``filenames``   - the full path of each document,
     * ``contents``    - the raw content of each document.

     Args:
         wordbag_path: File that receives the pickled Bunch; its parent
             directory is created when missing.
         seg_path: Directory with one sub-directory of segmented documents
             per category.
     """
     # Only sub-directories are categories; a stray file in seg_path would
     # otherwise make os.listdir() below raise OSError.
     categories = [
         name for name in os.listdir(seg_path)
         if os.path.isdir(os.path.join(seg_path, name))
     ]

     bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
     bunch.target_name.extend(categories)

     for category in categories:
         # os.path.join works with or without a trailing separator on seg_path.
         class_path = os.path.join(seg_path, category)
         for file_name in os.listdir(class_path):
             fullname = os.path.join(class_path, file_name)
             bunch.label.append(category)
             bunch.filenames.append(fullname)
             bunch.contents.append(_readfile(fullname))

     # open() does not create missing directories, so make sure the target
     # directory exists before writing the pickle.
     out_dir = os.path.dirname(wordbag_path)
     if out_dir and not os.path.isdir(out_dir):
         os.makedirs(out_dir)
     with open(wordbag_path, "wb") as file_obj:
         pickle.dump(bunch, file_obj)
     print("构建文本对象结束！！！")
   
 if __name__ == "__main__":
     # (pickled Bunch to write, segmented corpus directory to read) pairs:
     # first the training set, then the test set.
     bunch_jobs = (
         (
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat",
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/",
         ),
         (
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat",
             "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/",
         ),
     )
     for bunch_file, segmented_dir in bunch_jobs:
         corpus2Bunch(bunch_file, segmented_dir)

3.py(TFIDF_space.py)

[python]view plain copy
 #!/usr/bin/env python  
 # -*- coding: UTF-8 -*-  
 """ 
 @version: python2.7.8  
 @author: XiangguoSun 
 @contact: sunxiangguodut@qq.com 
 @file: TFIDF_space.py 
 @time: 2017/2/8 11:39 
 @software: PyCharm 
 """  
 import sys  
 # Python 2 only: reload sys to get access to setdefaultencoding, then make
 # UTF-8 the interpreter-wide default for implicit str/unicode conversions.
 reload(sys)  
 sys.setdefaultencoding('utf-8')  
   
 from sklearn.datasets.base import Bunch  
 import cPickle as pickle  
 from sklearn.feature_extraction.text import TfidfVectorizer  
   
 def _readfile(path):  
     with open(path, "rb") as fp:  
         content = fp.read()  
     return content  
   
 def _readbunchobj(path):  
     with open(path, "rb") as file_obj:  
         bunch = pickle.load(file_obj)  
     return bunch  
   
 def _writebunchobj(path, bunchobj):  
     with open(path, "wb") as file_obj:  
         pickle.dump(bunchobj, file_obj)  
   
 def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
     """Build a TF-IDF space from a pickled corpus Bunch and pickle the result.

     The output Bunch copies ``target_name``, ``label`` and ``filenames``
     from the input Bunch and adds ``tdm`` (the matrix returned by
     ``TfidfVectorizer.fit_transform``) and ``vocabulary``.

     Args:
         stopword_path: Text file with one stop word per line.
         bunch_path: Pickled Bunch whose ``contents`` are vectorised.
         space_path: File that receives the pickled TF-IDF Bunch.
         train_tfidf_path: Optional pickled TF-IDF Bunch of the training
             set.  When given, its ``vocabulary`` is reused so that the
             produced matrix has the same columns as the training matrix.
     """
     stop_words = _readfile(stopword_path).splitlines()
     corpus = _readbunchobj(bunch_path)
     space = Bunch(
         target_name=corpus.target_name,
         label=corpus.label,
         filenames=corpus.filenames,
         tdm=[],
         vocabulary={},
     )

     if train_tfidf_path is None:
         # Training run: learn the vocabulary from this corpus and keep it.
         vectorizer = TfidfVectorizer(
             stop_words=stop_words, sublinear_tf=True, max_df=0.5
         )
         space.tdm = vectorizer.fit_transform(corpus.contents)
         space.vocabulary = vectorizer.vocabulary_
     else:
         # Test run: pin the columns to the training vocabulary.
         # NOTE(review): fit_transform is still called on this corpus, so the
         # IDF weights appear to be estimated from it rather than taken from
         # the training set - confirm whether that is intended.
         train_space = _readbunchobj(train_tfidf_path)
         space.vocabulary = train_space.vocabulary
         vectorizer = TfidfVectorizer(
             stop_words=stop_words,
             sublinear_tf=True,
             max_df=0.5,
             vocabulary=train_space.vocabulary,
         )
         space.tdm = vectorizer.fit_transform(corpus.contents)

     _writebunchobj(space_path, space)
     print("tf-idf词向量空间实例创建成功！！！")
   
 if __name__ == '__main__':
     # Input: stop-word list shared by both runs.
     stop_words_file = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"

     # Training set: pickled corpus in, pickled TF-IDF space out.
     train_set_file = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
     train_space_file = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
     vector_space(stop_words_file, train_set_file, train_space_file)

     # Test set: vectorised with the vocabulary of the training space that
     # the call above has just written.
     test_set_file = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
     test_space_file = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"
     vector_space(stop_words_file, test_set_file, test_space_file, train_space_file)
 

4.py

#!/usr/bin/env python  
# -*- coding: UTF-8 -*-  

import sys  
# Python 2 only: reload sys to get access to setdefaultencoding, then make
# UTF-8 the interpreter-wide default for implicit str/unicode conversions.
reload(sys)  
sys.setdefaultencoding('utf-8')  
  
import cPickle as pickle  
from sklearn.naive_bayes import MultinomialNB  # 导入多项式贝叶斯算法  
  
  
# 读取bunch对象  
def _readbunchobj(path):  
    with open(path, "rb") as file_obj:  
        bunch = pickle.load(file_obj)  
    return bunch  
  
# Load the pickled TF-IDF space of the training set (path is relative to the
# current working directory).
trainpath = "../train_word_bag/tfidfspace.dat"  
train_set = _readbunchobj(trainpath)  
  
# Load the pickled TF-IDF space of the test set.
testpath = "../test_word_bag/testspace.dat"  
test_set = _readbunchobj(testpath)  
  
# Earlier naive Bayes variant, kept for reference: it was trained on the same
# TF-IDF matrix and labels.
# NOTE(review): the original remark claimed that a smaller alpha means more
# iterations and higher accuracy; alpha is the smoothing parameter of
# MultinomialNB, so that remark looks incorrect - confirm before relying on it.
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)  
  
######################################################  

from sklearn import tree
print '*************************决策树************************'  

# Fit a decision tree with default settings on the training TF-IDF matrix
# and its category labels.
clf = tree.DecisionTreeClassifier()
clf.fit(train_set.tdm, train_set.label)  
  
# Predict a category for every document of the test set.

print '*************************开始预测************************'  
predicted = clf.predict(test_set.tdm)  
  
# Report only the documents whose predicted category differs from the true
# label: flabel is the true label, expct_cate the predicted one.
for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):  
    if flabel != expct_cate:  
        print file_name,": 实际类别:",flabel," -->预测类别:",expct_cate  
  
print "预测完毕!!!"  
  
# 计算分类精度：  
from sklearn import metrics  
def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 of ``predict`` vs ``actual``.

    Args:
        actual: True labels.
        predict: Predicted labels, in the same order as ``actual``.
    """
    # Each entry pairs the output template with the scorer that fills it;
    # the scores are computed and printed one after another, in this order.
    report_lines = (
        ('精度:{0:.3f}', metrics.precision_score),
        ('召回:{0:0.3f}', metrics.recall_score),
        ('f1-score:{0:.3f}', metrics.f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(actual, predict, average='weighted')))
  
# Score the predictions against the true labels of the test set.
metrics_result(test_set.label, predicted)

依然使用复旦大学的新闻数据集

运行结果（这里复制一部分）：

../test_corpus_seg/C37-Military/C37-Military008.txt : 实际类别: C37-Military -->预测类别: C11-Space
../test_corpus_seg/C37-Military/C37-Military031.txt : 实际类别: C37-Military -->预测类别: C38-Politics
../test_corpus_seg/C37-Military/C37-Military101.txt : 实际类别: C37-Military -->预测类别: C38-Politics
../test_corpus_seg/C37-Military/C37-Military006.txt : 实际类别: C37-Military -->预测类别: C32-Agriculture
../test_corpus_seg/C37-Military/C37-Military125.txt : 实际类别: C37-Military -->预测类别: C29-Transport
预测完毕!!!
精度:0.878
召回:0.879
f1-score:0.878

微电子学与固体电子学-俞驰

关注

1
点赞
踩
18

收藏

觉得还不错? 一键收藏
0
评论
决策树（CART算法）针对中文文本分类

改编自博客：http://blog.csdn.net/github_36326955/article/details/54891204根据下面的参考链接可知，sklearn中的决策树用的是CART算法http://sofasofa.io/forum_main_post.php?postid=1000402&做个笔记代码按照1 2 3
复制链接

扫一扫

专栏目录