007_NLP_Task5

最新推荐文章于 2021-01-13 02:23:31 发布

Vampire_lover

最新推荐文章于 2021-01-13 02:23:31 发布

阅读量244

点赞数 1

本文链接：https://blog.csdn.net/Vampire_lover/article/details/90370034

版权

朴素贝叶斯（Naïve Bayes）

原理：

见：https://blog.csdn.net/u013710265/article/details/72780520

利用朴素贝叶斯模型进行文本分类：

# -*- coding: utf-8 -*-
#Author:Shanv
#function:
import pandas as pd
import numpy as np
import datetime


#构建词向量
def loadDataSet():
    """Return the toy message-board corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenized posts (each a list of word strings); ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    rawPosts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    # Whitespace splitting yields one token list per post.
    postingList = [sentence.split() for sentence in rawPosts]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec

#将每篇文档返回的新词集合添加到一个集合中去，词不重复
def createVocabList(dataSet):
    """Collect every distinct token across the documents into a sorted list.

    Args:
        dataSet: Iterable of documents, each an iterable of tokens.

    Returns:
        A list of the unique tokens in ascending sorted order.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Merge this document's tokens; duplicates collapse automatically.
        uniqueWords.update(doc)
    vocabulary = list(uniqueWords)
    vocabulary.sort()
    return vocabulary

#构建词向量：输入词汇表和文档，输出文档向量，向量的每个元素为1或0，
# 分别表示词汇表中的单词在输入文档中是否出现。先创建一个和词汇表等长的向量，
# 遍历文档中的所有单词，如果出现了词汇表中的单词，则将输出的文档向量中的对应值设为1。

def setOfWords2Vec(vocabList, inputSet):
    """Encode one document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of the words that occur in the document.

    Returns:
        A list as long as ``vocabList`` holding 1 where the vocabulary word
        occurs in the document and 0 elsewhere. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per word (``in`` followed by ``index``).
    # setdefault keeps the FIRST position, matching ``list.index``.
    firstIndex = {}
    for position, term in enumerate(vocabList):
        firstIndex.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    """Encode several documents as bag-of-words (term count) vectors.

    Args:
        vocabList: List of vocabulary words; position i of each vector
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of documents, each an iterable of words. Despite
            the name, this is a collection of documents, not a single one.

    Returns:
        A list with one count vector per document, in input order. Each
        vector is as long as ``vocabList`` and counts how many times every
        vocabulary word occurs in that document. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per word (``in`` followed by ``index``).
    # setdefault keeps the FIRST position, matching ``list.index``.
    firstIndex = {}
    for position, term in enumerate(vocabList):
        firstIndex.setdefault(term, position)

    articleVec = []
    for article in inputSet:
        returnVec = [0] * len(vocabList)
        for word in article:
            position = firstIndex.get(word)
            if position is None:
                print("the word: %s is not in my Vocabulary!" % word)
            else:
                returnVec[position] += 1  # count occurrences, not presence
        articleVec.append(returnVec)

    return articleVec

#从词向量计算概率

#朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class naive Bayes text classifier.

    Args:
        trainMatrix: Indexable collection of document vectors, one row per
            document and one entry per vocabulary word.
        trainCategory: Per-document labels. Rows whose label equals 1 feed
            the class-1 statistics; every other row feeds class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the log of the smoothed per-word
        conditional probabilities for class 0 and class 1, and the fraction
        given by ``sum(trainCategory) / len(trainMatrix)``.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docTotal)

    # Classification multiplies p(w1|c) * p(w2|c) * ...; a single zero factor
    # would zero the whole product. To soften that, every word count starts
    # at 1 and every denominator starts at 2.
    # Index 0 holds the class-0 accumulators, index 1 the class-1 ones.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for rowIdx in range(docTotal):
        docVec = trainMatrix[rowIdx]
        label = 1 if trainCategory[rowIdx] == 1 else 0
        wordCounts[label] += docVec
        wordTotals[label] += sum(docVec)

    # Multiplying many small probabilities underflows towards 0, so the
    # conditional probabilities are returned as natural logs and the
    # classifier adds them instead of multiplying.
    p1Vect = np.log(wordCounts[1] / wordTotals[1])
    p0Vect = np.log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive

#朴素贝叶斯分类函数
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for one document vector.

    Args:
        vec2Classify: Document vector to classify; it is multiplied
            element-wise with the log-probability vectors.
        p0Vec: Per-word log conditional probabilities for class 0.
        p1Vec: Per-word log conditional probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (so ties go to class 0).
    """
    # Log score = sum of weighted word log-likelihoods + log prior.
    abusiveScore = sum(vec2Classify * p1Vec) + np.log(pClass1)
    normalScore = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if abusiveScore > normalScore else 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two posts.

    Builds set-of-words vectors for the corpus returned by ``loadDataSet``,
    fits the model with ``trainNB0``, then classifies two hard-coded test
    posts and prints each one with its predicted label.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

if __name__ == '__main__':
    # Script entry point: run the naive Bayes demo and report how long it took.
    startTime = datetime.datetime.now()
    print('start')

    # Alternative walkthrough using the bag-of-words model (kept disabled):
    # listPost, listClasses = loadDataSet()
    # myVocabList = createVocabList(listPost)
    # print(myVocabList)
    # print(len(myVocabList))
    #
    # Vec_mat = bagOfWords2VecMN(myVocabList, listPost)
    # print(Vec_mat)
    #
    # p0V, p1V, pAb = trainNB0(Vec_mat, listClasses)
    # print(pAb)
    # print(p1V)

    testingNB()

    endTime = datetime.datetime.now()

    # timedelta.seconds is only the seconds *component* (0-86399) and drops
    # whole days; total_seconds() is the full elapsed time. The %d format
    # below truncates it to whole seconds.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)

输出：

SVM模型

原理：

见：https://www.cnblogs.com/pinard/p/6097604.html

利用SVM模型进行文本分类：

# -*- coding: utf-8 -*-
#Author:Shanv
#function:
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from prettytable import PrettyTable

if __name__ == '__main__':
    # Script entry point: TF-IDF features + SVM on the toy corpus, then print
    # the predicted class of two test sentences and the elapsed time.
    startTime = datetime.datetime.now()
    print('start')
    postingList = ['my dog has flea problems help please',
                   'maybe not take him to dog park stupid',
                   'my dalmation is so cute I love him',
                   'stop posting stupid worthless garbage',
                   'mr licks ate my steak how to stop him',
                   'quit buying worthless dog food stupid']
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not

    vector = TfidfVectorizer()
    # Learn the vocabulary/IDF weights and vectorize the training posts in
    # one call; the fitted vectorizer is reused for the test sentences.
    train_tfidf = vector.fit_transform(postingList)
    testEntry = ['love my dalmation',
                 'stupid garbage']
    test_tfidf = vector.transform(testEntry)
    clf = svm.SVC()
    clf.fit(train_tfidf, classVec)
    result = clf.predict(test_tfidf)
    print(result)
    # Show each test sentence next to its predicted class.
    tb = PrettyTable()
    tb.add_column('测试句子', testEntry)
    tb.add_column('所属类别', result)
    print(tb)

    endTime = datetime.datetime.now()

    # timedelta.seconds is only the seconds *component* (0-86399) and drops
    # whole days; total_seconds() is the full elapsed time. The %d format
    # below truncates it to whole seconds.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)

输出：

LDA主题模型

pLSA：http://www.cnblogs.com/bentuwuying/p/6219970.html

共轭先验分布：https://www.jianshu.com/p/bb7bce40a15a

使用LDA生成主题特征

# -*- coding: utf-8 -*-
#Author:Shanv
#function:
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

if __name__ == '__main__':
    # Script entry point: term counts -> 2-topic LDA on the toy corpus, then
    # print the document-topic and topic-word matrices and the elapsed time.
    startTime = datetime.datetime.now()
    print('start')
    postingList = ['my dog has flea problems help please',
                   'maybe not take him to dog park stupid',
                   'my dalmation is so cute I love him',
                   'stop posting stupid worthless garbage',
                   'mr licks ate my steak how to stop him',
                   'quit buying worthless dog food stupid']
    conVec = CountVectorizer()
    cntTf = conVec.fit_transform(postingList)
    # Sparse print-out: (document index, vocabulary index) followed by the
    # term count.
    print(cntTf)
    # Vocabulary size. vocabulary_ is used instead of get_feature_names(),
    # which newer scikit-learn releases no longer provide.
    print(len(conVec.vocabulary_))
    # n_topics was renamed to n_components in scikit-learn 0.19 and later
    # removed. learning_method is pinned to 'online', the default in effect
    # when this script was written (it changed to 'batch' in 0.20).
    lda = LatentDirichletAllocation(n_components=2,
                                    max_iter=50,
                                    learning_method='online',
                                    random_state=0)
    result = lda.fit_transform(cntTf)
    print(result)            # one row per document, one column per topic
    print(lda.components_)   # one row per topic, one column per vocabulary word

    endTime = datetime.datetime.now()

    # timedelta.seconds is only the seconds *component* (0-86399) and drops
    # whole days; total_seconds() is the full elapsed time. The %d format
    # below truncates it to whole seconds.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)

输出：

(0, 20)   1
(0, 9)   1
(0, 22)   1
DeprecationWarning)
(0, 5)   1
(0, 8)   1
(0, 4)   1
(0, 17)   1
(1, 27)   1
(1, 19)   1
E:\Anaconda3\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
(1, 29)   1
(1, 10)   1
(1, 28)   1
DeprecationWarning)
(1, 18)   1
(1, 15)   1
(1, 4)   1
(2, 14)   1
(2, 2)   1
(2, 24)   1
(2, 12)   1
(2, 3)   1
(2, 10)   1
(2, 17)   1
(3, 7)   1
(3, 30)   1
(3, 21)   1
(3, 26)   1
(3, 27)   1
(4, 11)   1
(4, 25)   1
(4, 0)   1
(4, 13)   1
(4, 16)   1
(4, 26)   1
(4, 29)   1
(4, 10)   1
(4, 17)   1
(5, 6)   1
(5, 1)   1
(5, 23)   1
(5, 30)   1
(5, 27)   1
(5, 4)   1
31
[[0.92639376 0.07360624]
[0.06612091 0.93387909]
[0.9333989 0.0666011 ]
[0.09046681 0.90953319]
[0.94207392 0.05792608]
[0.07491194 0.92508806]]
[[1.49365323 0.50927624 1.49328501 1.49095433 1.46033301 1.49139637
0.50944135 0.50880935 1.49115491 1.49101833 2.51012186 1.49212922
1.48955188 1.4929011 1.4934558 0.50646043 1.49366054 3.47706696
0.50694741 0.50611621 1.49164314 0.50982271 1.4898492 0.50750537
1.49111169 1.49206142 1.50074828 0.50993049 0.50698316 1.4953623
0.50969679]
[0.50795946 1.49052036 0.5078402 0.50774569 2.523933 0.50774341
1.49141175 1.49176891 0.50921497 0.50947059 1.47604112 0.5074536
0.50875301 0.50672472 0.50892791 1.49283421 0.50640215 0.51044215
1.49225453 1.49221378 0.51014977 1.49129086 0.50989447 1.49156271
0.50877271 0.50670199 1.49160454 3.47513722 1.49181711 1.49908467
2.48368876]]
2019-05-20 19:43:01.917556 -------- 2019-05-20 19:43:01.963433
共消耗0秒