Using TF-IDF for Article Classification

Reposted from http://blog.csdn.net/orlandowww/article/details/52706135
Downloads:
Stop-word list:
http://download.csdn.net/download/kevinelstri/9817721
Training corpus for word segmentation:
http://download.csdn.net/download/github_36326955/9747927
1) TF-IDF
TF = (number of occurrences of a term in the document) / (total number of terms in the document)
IDF = log(total number of documents in the corpus / (number of documents containing the term + 1))
Computation:
See http://www.voidcn.com/article/p-bbabkmsv-pt.html
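A minimal sketch of the two formulas above on a toy corpus (the documents and the term below are made-up example data, not part of the original post):

# coding:utf-8
import math

# toy corpus: each document is a list of terms (made-up example data)
docs = [
    ["this", "is", "the", "first", "document"],
    ["this", "is", "the", "second", "second", "document"],
    ["and", "the", "third", "one"],
]

def tf(term, doc):
    # TF = occurrences of the term in the document / total number of terms in the document
    return doc.count(term) / float(len(doc))

def idf(term, corpus):
    # IDF = log(total number of documents / (documents containing the term + 1))
    containing = sum(1 for d in corpus if term in d)
    return math.log(len(corpus) / float(containing + 1))

print tf("second", docs[1]) * idf("second", docs)   # TF-IDF of "second" in the second document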

# coding:utf-8
from sklearn.feature_extraction.text import CountVectorizer

# corpus
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# convert the texts into a term-frequency matrix
vectorizer = CountVectorizer()
# count the occurrences of each term
X = vectorizer.fit_transform(corpus)
# get all terms in the bag of words
word = vectorizer.get_feature_names()
print word
# inspect the term frequencies
print X.toarray()

from sklearn.feature_extraction.text import TfidfTransformer

# instantiate the transformer
transformer = TfidfTransformer()
print transformer
# turn the term-frequency matrix X into TF-IDF weights
tfidf = transformer.fit_transform(X)
# inspect the result: element [i][j] is the TF-IDF weight of term j in document i
print tfidf.toarray()
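As a side note, the two-step CountVectorizer + TfidfTransformer pipeline above can be collapsed into a single TfidfVectorizer call (the same class used later for the real corpus). A short sketch on the same toy corpus:

# coding:utf-8
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# TfidfVectorizer = CountVectorizer followed by TfidfTransformer in one step
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
print vectorizer.get_feature_names()
print tfidf.toarray()   # same weights as the two-step version above (default parameters)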

2) Word segmentation
Use the Python package jieba.

# -*- encoding: utf-8 -*-

import sys
import os
import jieba
reload(sys)  
sys.setdefaultencoding('utf-8') 

# save a file
def savefile(savepath,content):
    with open(savepath,"wb") as fp:
        fp.write(content)
# read a file
def readfile(path):
    with open(path,"rb") as fp:
        content=fp.read()
    return content
def corpus_segment(corpus_path,seg_path):
    if not os.path.exists(seg_path): # make sure the output directory exists
        os.makedirs(seg_path)
    for parent,dirnames,filenames in os.walk(corpus_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath=os.path.join(parent,filename)
                content=readfile(filepath)
                content=content.replace("\r\n","") # remove line breaks
                content=content.replace(" ","") # remove spaces
                content_seg=jieba.cut(content) # segment the file content
                savefile(seg_path+filename," ".join(content_seg))
    print "Finished segmenting the Chinese corpus\n"

if __name__=="__main__":
    # segment the training corpus
    corpus_path="./train_corpus/"
    seg_path="./train_corpus_seg/"
    corpus_segment(corpus_path,seg_path)
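Before running the script on the full corpus, it can help to check what jieba.cut returns: a generator of tokens, which the script joins with spaces. A tiny sketch with a made-up sentence:

# -*- encoding: utf-8 -*-
import jieba

tokens = jieba.cut("我爱北京天安门")   # made-up example sentence
print " ".join(tokens)                 # e.g. 我 爱 北京 天安门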

3) Convert the text files into a Bunch

# -*- encoding: utf-8 -*-
import sys
import os
import re
reload(sys)  
sys.setdefaultencoding('utf-8') 
import cPickle as pickle
from sklearn.datasets.base import Bunch

# read a file
def readfile(path):
    with open(path,"rb") as fp:
        content=fp.read()
    return content
def corpus2Bunch(wordbag_path,seg_path):
    # create a Bunch instance
    bunch=Bunch(target_name=[],label=[],filenames=[],contents=[])
    # the label is parsed from the file name; e.g. a name like "C3-Art0001.txt" is expected
    # to yield the label "Art"
    regpat=re.compile(r'-(\w+)\.txt')
    for parent,dirnames,filenames in os.walk(seg_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath=os.path.join(parent,filename)
                templabel=regpat.search(filename)
                templabel=templabel.group(1)[:-4] # strip the trailing 4-digit file index
                bunch.label.append(templabel)
                if templabel not in bunch.target_name:
                    bunch.target_name.append(templabel)
                bunch.filenames.append(filepath)
                bunch.contents.append(readfile(filepath))
    print bunch.target_name
    print bunch.filenames[:10]
    print bunch.label[:10]
    wordbag_dir=os.path.dirname(wordbag_path)
    if wordbag_dir and not os.path.exists(wordbag_dir): # make sure the output directory exists
        os.makedirs(wordbag_dir)
    with open(wordbag_path,"wb") as file_obj:
        pickle.dump(bunch,file_obj)
    print "Finished building the text Bunch object\n"
if __name__=="__main__":
    wordbag_path="train_word_bag/train_set.dat" # where the Bunch is saved
    seg_path="train_corpus_seg/" # path to the segmented training corpus
    corpus2Bunch(wordbag_path,seg_path)
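A quick sanity check, assuming the script above has already produced train_word_bag/train_set.dat: reload the Bunch and confirm the parallel lists line up.

# -*- encoding: utf-8 -*-
import cPickle as pickle

with open("train_word_bag/train_set.dat", "rb") as f:
    bunch = pickle.load(f)

print bunch.target_name   # the category names
assert len(bunch.label) == len(bunch.filenames) == len(bunch.contents)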

4) Build the TF-IDF vector space

# -*- encoding: utf-8 -*-
import sys
reload(sys)  
sys.setdefaultencoding('utf-8') 
import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer 
# read a file
def readfile(path):
    with open(path,"rb") as fp:
        content=fp.read()
    return content
# read a pickled Bunch object
def readbunchobj(path):
    with open(path,"rb") as file_obj:
        bunch=pickle.load(file_obj)
    return bunch
# write a Bunch object to disk
def writebunchobj(path,bunchobj):
    with open(path,"wb") as file_obj:
        pickle.dump(bunchobj,file_obj)

def vector_space(stopword_path,bunch_path,space_path):
    stpwrdlst=readfile(stopword_path).splitlines() # load the stop words
    bunch=readbunchobj(bunch_path) # load the Bunch built from the segmented corpus
    # build the TF-IDF vector-space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
    '''
    tdm holds the computed TF-IDF weight matrix.
    vocabulary is the index of the vector space. For example, if the vector space is
    ("I", "like", "apples"), then vocabulary is the index dictionary
    vocabulary={"I":0,"like":1,"apples":2}. You can think of vocabulary as the axes of the
    vector space: the index value tells you which dimension a term occupies.
    '''
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    '''
    The parameters you need to know:
    stop_words:
    the stop-word list; these words are dropped when vocabulary_ is built from the texts.
    vocabulary:
    explained above.
    sublinear_tf:
    use a sublinear TF: instead of the raw term frequency, 1+log(tf) is used.
    smooth_idf:
    when computing IDF the document frequency could be 0; smooth_idf adds one to the
    document-frequency counts (as if one extra document containing every term existed),
    so the denominator is never zero. It is on by default, so there is nothing to set.
    norm:
    normalization. TF-IDF vectors can be normalized or not; normalization is the usual
    choice and is on by default.
    max_df:
    some terms have a very high document frequency (if a term appears in every document,
    it is useless for separating classes), so we set a threshold. A float such as 0.5
    (range [0.0,1.0]) means a term appearing in more than 50% of the documents is treated
    as a temporary stop word. An int such as max_df=10 means a term appearing in more than
    10 documents is treated as a temporary stop word.
    min_df:
    the opposite of max_df. A lower document frequency seems more discriminative, but if it
    is too low (say 1 document out of 10000), that single document adds a whole dimension
    to the vector space, which is not worth it.
    Note that max_df and min_df are ignored when the vocabulary parameter is given.
    '''  
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents) 
    tfidfspace.vocabulary = vectorizer.vocabulary_ 
    writebunchobj(space_path, tfidfspace)  
    print "TF-IDF vector space created successfully!"
if __name__=="__main__":
    stopword_path="train_word_bag/stopword.txt"
    bunch_path="train_word_bag/train_set.dat" # path of the training-set Bunch
    space_path="train_word_bag/tfidfspace.dat" # where the vector space is saved
    vector_space(stopword_path,bunch_path,space_path)
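To see what was produced, a short sketch (assuming the script above has been run) that reloads tfidfspace.dat and inspects the TF-IDF matrix:

# -*- encoding: utf-8 -*-
import cPickle as pickle

with open("train_word_bag/tfidfspace.dat", "rb") as f:
    tfidfspace = pickle.load(f)

print tfidfspace.tdm.shape        # (number of documents, vocabulary size)
print len(tfidfspace.vocabulary)  # equals the number of columns above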

5) Naive Bayes classifier
Bayes' theorem: for events A and B, their probabilities satisfy
P(A|B) = P(B|A) * P(A) / P(B)
Bayesian classification principle: a sample is assigned to the class with the largest posterior probability given its features (shown as a figure in the original post).
5.1 Gaussian naive Bayes
Gaussian naive Bayes models each continuous feature with a per-class normal distribution (formula shown as a figure in the original post).
5.2 Multinomial naive Bayes
Multinomial naive Bayes models features as counts, e.g. word frequencies (formula shown as a figure in the original post); this is the variant used below.
5.3 Bernoulli model
In the Bernoulli model, each feature takes the value 0 or 1.
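A toy comparison of the three variants as exposed by scikit-learn, on made-up count data, just to show the interfaces; MultinomialNB is the one used below with the TF-IDF features:

# coding:utf-8
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# made-up word-count features for four tiny "documents" and two classes
X = np.array([[2, 0, 1],
              [0, 3, 0],
              [1, 1, 0],
              [0, 2, 1]])
y = np.array([0, 1, 0, 1])

print GaussianNB().fit(X, y).predict(X)      # per-class Gaussian features
print MultinomialNB().fit(X, y).predict(X)   # count-style features (used below)
print BernoulliNB().fit(X, y).predict(X)     # binarizes each feature to 0/1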
6) Article classifier

# -*- encoding: utf-8 -*-
import sys
import numpy as np
reload(sys)  
sys.setdefaultencoding('utf-8') 
import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB # multinomial naive Bayes

# read a pickled Bunch object
def readbunchobj(path):
    with open(path,"rb") as file_obj:
        bunch=pickle.load(file_obj)
    return bunch
# write a pickled object to disk
def writebunchobj(path,bunchobj):
    with open(path,"wb") as file_obj:
        pickle.dump(bunchobj,file_obj)
# load the training set
trainpath="train_word_bag/tfidfspace.dat"
train_set=readbunchobj(trainpath)
#a=train_set.tdm.toarray()[0,:]
#indices=np.where(a!=0)
#print a[indices]

# train the classifier: input is the TF-IDF matrix and the class labels, alpha=0.001
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label) 
clf_path="train_word_bag/clf.m" # where the naive Bayes model is saved
writebunchobj(clf_path, clf) 
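A quick sketch, assuming the two files above exist, that reloads the pickled model and scores it on the training matrix (training accuracy only, not a substitute for the test evaluation in the next step):

# -*- encoding: utf-8 -*-
import cPickle as pickle

def load(path):
    with open(path, "rb") as f:
        return pickle.load(f)

clf = load("train_word_bag/clf.m")                # the pickled classifier
train_set = load("train_word_bag/tfidfspace.dat") # the training TF-IDF space
print clf.score(train_set.tdm, train_set.label)   # mean accuracy on the training data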

7) Prediction

# -*- encoding: utf-8 -*-
import sys
import os
import jieba
import re
reload(sys)  
sys.setdefaultencoding('utf-8') 
import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB # multinomial naive Bayes

# save a file
def savefile(savepath,content):
    with open(savepath,"wb") as fp:
        fp.write(content)
# read a file
def readfile(path):
    with open(path,"rb") as fp:
        content=fp.read()
    return content
def corpus_segment(corpus_path,seg_path):
    if not os.path.exists(seg_path): # make sure the output directory exists
        os.makedirs(seg_path)
    for parent,dirnames,filenames in os.walk(corpus_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath=os.path.join(parent,filename)
                content=readfile(filepath)
                content=content.replace("\r\n","") # remove line breaks
                content=content.replace(" ","") # remove spaces
                content_seg=jieba.cut(content) # segment the file content
                savefile(seg_path+filename," ".join(content_seg))
    print "Finished segmenting the Chinese corpus\n"
def corpus2Bunch(wordbag_path,seg_path):
    # create a Bunch instance
    bunch=Bunch(target_name=[],label=[],filenames=[],contents=[])
    # the label is parsed from the file name; e.g. a name like "C3-Art0001.txt" is expected
    # to yield the label "Art"
    regpat=re.compile(r'-(\w+)\.txt')
    for parent,dirnames,filenames in os.walk(seg_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath=os.path.join(parent,filename)
                templabel=regpat.search(filename)
                templabel=templabel.group(1)[:-4] # strip the trailing 4-digit file index
                bunch.label.append(templabel)
                if templabel not in bunch.target_name:
                    bunch.target_name.append(templabel)
                bunch.filenames.append(filepath)
                bunch.contents.append(readfile(filepath))
    print bunch.target_name
    print bunch.filenames[:10]
    print bunch.label[:10]
    wordbag_dir=os.path.dirname(wordbag_path)
    if wordbag_dir and not os.path.exists(wordbag_dir): # make sure the output directory exists
        os.makedirs(wordbag_dir)
    with open(wordbag_path,"wb") as file_obj:
        pickle.dump(bunch,file_obj)
    print "Finished building the text Bunch object\n"
# read a pickled Bunch object
def readbunchobj(path):
    with open(path,"rb") as file_obj:
        bunch=pickle.load(file_obj)
    return bunch
# write a pickled object to disk
def writebunchobj(path,bunchobj):
    with open(path,"wb") as file_obj:
        pickle.dump(bunchobj,file_obj)

def vector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
    stpwrdlst=readfile(stopword_path).splitlines() # load the stop words
    bunch=readbunchobj(bunch_path) # load the Bunch built from the segmented test corpus
    # build the TF-IDF vector-space object (see the parameter notes in step 4)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
    # reuse the training vocabulary so the test vectors live in the same space as the
    # training vectors
    trainbunch=readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,vocabulary=trainbunch.vocabulary)
    # note: fit_transform here recomputes IDF from the test set; a stricter pipeline would
    # reuse the fitted training vectorizer instead
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)   
    writebunchobj(space_path, tfidfspace)  
    print "TF-IDF vector space created successfully!"
def metrics_result(actual, predict):  
    from sklearn import metrics 
    print 'Precision: {0:.3f}'.format(metrics.precision_score(actual, predict,average='weighted'))  
    print 'Recall: {0:.3f}'.format(metrics.recall_score(actual, predict,average='weighted'))  
    print 'F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict,average='weighted'))  
if __name__=="__main__":
    clf_path="train_word_bag/clf.m" # path of the saved naive Bayes model
    clf=readbunchobj(clf_path)

    # segment the test corpus
    corpus_path="./test_corpus/"
    seg_path="./test_corpus_seg/"
    corpus_segment(corpus_path,seg_path)

    wordbag_path="test_word_bag/test_set.dat" # where the test-set Bunch is saved
    seg_path="test_corpus_seg/" # path to the segmented test corpus
    corpus2Bunch(wordbag_path,seg_path)

    stopword_path="train_word_bag/stopword.txt"
    bunch_path="test_word_bag/test_set.dat" # path of the test-set Bunch
    space_path="test_word_bag/tfidfspace.dat" # where the test vector space is saved
    train_tfidf_path="train_word_bag/tfidfspace.dat" # training vector space from step 4
    vector_space(stopword_path,bunch_path,space_path,train_tfidf_path)

    # load the test set
    testpath="test_word_bag/tfidfspace.dat"
    test_set=readbunchobj(testpath)

    # predict the class of each test document
    predicted = clf.predict(test_set.tdm)
    for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):  
        if flabel != expct_cate:  
            print file_name,": actual:",flabel," --> predicted:",expct_cate 
    print "Prediction finished!"

    # compute the classification metrics
    metrics_result(test_set.label, predicted)  
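Beyond the weighted averages printed by metrics_result, a per-class breakdown can be useful. A small sketch that reloads the saved objects (file paths as in the script above):

# -*- encoding: utf-8 -*-
import cPickle as pickle
from sklearn import metrics

def load(path):
    with open(path, "rb") as f:
        return pickle.load(f)

clf = load("train_word_bag/clf.m")
test_set = load("test_word_bag/tfidfspace.dat")
predicted = clf.predict(test_set.tdm)

print metrics.classification_report(test_set.label, predicted)  # per-class precision/recall/F1
print metrics.confusion_matrix(test_set.label, predicted)       # rows: actual, columns: predicted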

The results are as follows:
[Screenshot in the original post showing the script's output]
