朴素贝叶斯(Naïve Bayes)
原理:
见:https://blog.csdn.net/u013710265/article/details/72780520
利用朴素贝叶斯模型进行文本分类:
# -*- coding: utf-8 -*-
# Author: Shanv
# function: naive Bayes text classification on a toy corpus
import pandas as pd
import numpy as np
import datetime


def loadDataSet():
    """Return the toy corpus and its labels.

    Returns:
        A tuple ``(postingList, classVec)`` where ``postingList`` is a list
        of tokenized posts and ``classVec`` holds one label per post
        (1 is abusive, 0 not).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    """Collect every distinct word of every document into a sorted list.

    Args:
        dataSet: iterable of tokenized documents (iterables of words).

    Returns:
        Sorted list of the unique words found in ``dataSet``.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union of the two sets
    return sorted(vocabSet)


def _vocabIndex(vocabList):
    """Map each word to the position of its first occurrence in vocabList."""
    index = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        index.setdefault(word, position)
    return index


def setOfWords2Vec(vocabList, inputSet):
    """Build a set-of-words (presence/absence) vector for one document.

    A vector as long as the vocabulary is created; for every word of the
    document that appears in the vocabulary the matching element is set
    to 1. Words missing from the vocabulary are reported on stdout.

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of words of a single document.

    Returns:
        List of 0/1 values, one per vocabulary word.
    """
    index = _vocabIndex(vocabList)  # one O(1) lookup per word instead of list.index
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    """Build bag-of-words (count) vectors for a collection of documents.

    Unlike ``setOfWords2Vec`` this takes several documents at once and
    counts how many times each vocabulary word occurs in each of them.
    Words missing from the vocabulary are reported on stdout.

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of documents, each an iterable of words.

    Returns:
        List with one count vector (list of ints) per document.
    """
    index = _vocabIndex(vocabList)
    articleVec = []
    for article in inputSet:
        returnVec = [0] * len(vocabList)
        for word in article:
            position = index.get(word)
            if position is None:
                print("the word: %s is not in my Vocabulary!" % word)
            else:
                returnVec[position] += 1  # bag-of-words: count occurrences
        articleVec.append(returnVec)
    return articleVec


def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier from word vectors.

    Classifying a document multiplies many conditional probabilities
    p(w1|c) p(w2|c) p(w3|c) ...; if any single factor is 0 the whole
    product is 0. To reduce that effect every word count starts at 1 and
    every denominator starts at 2.0.

    Args:
        trainMatrix: sequence of word vectors, one per training document.
        trainCategory: sequence of labels (1 or other), one per document.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the log conditional word
        probabilities for class 0 and class 1, and the fraction of
        training documents labelled 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Counts start at ones() and denominators at 2.0 (smoothing, see docstring).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i, docVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += docVec
            p1Denom += sum(docVec)
        else:
            p0Num += docVec
            p0Denom += sum(docVec)
    # Multiplying many very small factors underflows (or rounds to 0), so
    # the natural log of each probability is stored and later summed.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with a trained naive Bayes model.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: log conditional word probabilities for class 0.
        p1Vec: log conditional word probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0.
    """
    # Element-wise product picks the log probabilities of the present words.
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the toy corpus and print the class of two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))


if __name__ == '__main__':
    startTime = datetime.datetime.now()
    print('start')
    # Alternative experiment using bag-of-words counts instead of testingNB():
    # listPost, listClasses = loadDataSet()
    # myVocabList = createVocabList(listPost)
    # print(myVocabList)
    # print(len(myVocabList))
    #
    # Vec_mat = bagOfWords2VecMN(myVocabList, listPost)
    # print(Vec_mat)
    #
    # p0V, p1V, pAb = trainNB0(Vec_mat, listClasses)
    # print(pAb)
    # print(p1V)
    testingNB()
    endTime = datetime.datetime.now()
    # total_seconds() covers the whole interval; .seconds drops whole days.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)
输出:
SVM模型
原理:
见:https://www.cnblogs.com/pinard/p/6097604.html
利用SVM模型进行文本分类:
# -*- coding: utf-8 -*-
# Author: Shanv
# function: text classification with TF-IDF features and an SVM
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from prettytable import PrettyTable


def main():
    """Fit an SVC on TF-IDF vectors of the toy posts and print predictions."""
    startTime = datetime.datetime.now()
    print('start')
    postingList = [
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not

    # Learn the vocabulary/IDF weights on the training posts and vectorize
    # them in one step; test sentences reuse the fitted vectorizer.
    vector = TfidfVectorizer()
    train_tfidf = vector.fit_transform(postingList)
    testEntry = ['love my dalmation', 'stupid garbage']
    test_tfidf = vector.transform(testEntry)

    clf = svm.SVC()
    clf.fit(train_tfidf, classVec)
    result = clf.predict(test_tfidf)
    print(result)

    # Show each test sentence next to its predicted class.
    tb = PrettyTable()
    tb.add_column('测试句子', testEntry)
    tb.add_column('所属类别', result)
    print(tb)

    endTime = datetime.datetime.now()
    # total_seconds() covers the whole interval; .seconds drops whole days.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)


if __name__ == '__main__':
    main()
输出:
LDA主题模型
pLSA:http://www.cnblogs.com/bentuwuying/p/6219970.html
共轭先验分布:https://www.jianshu.com/p/bb7bce40a15a
使用LDA生成主题特征
# -*- coding: utf-8 -*-
# Author: Shanv
# function: generate topic features with LDA
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


def main():
    """Fit a 2-topic LDA on word counts of the toy posts and print results."""
    startTime = datetime.datetime.now()
    print('start')
    postingList = [
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    ]

    conVec = CountVectorizer()
    cntTf = conVec.fit_transform(postingList)
    # Each printed entry reads: (document index, vocabulary index)  count
    print(cntTf)
    # Vocabulary size; vocabulary_ is used because get_feature_names() is
    # no longer available in current scikit-learn releases.
    print(len(conVec.vocabulary_))

    # n_components replaces the removed n_topics keyword.
    lda = LatentDirichletAllocation(n_components=2, max_iter=50, random_state=0)
    result = lda.fit_transform(cntTf)
    print(result)  # one row of topic weights per document
    print(lda.components_)  # one row of word weights per topic

    endTime = datetime.datetime.now()
    # total_seconds() covers the whole interval; .seconds drops whole days.
    totalTime = (endTime - startTime).total_seconds()
    print(startTime, '--------', endTime)
    print('共消耗%d秒' % totalTime)


if __name__ == '__main__':
    main()
输出:
(0, 20) 1
(0, 9) 1
(0, 22) 1
DeprecationWarning)
(0, 5) 1
(0, 8) 1
(0, 4) 1
(0, 17) 1
(1, 27) 1
(1, 19) 1
E:\Anaconda3\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
(1, 29) 1
(1, 10) 1
(1, 28) 1
DeprecationWarning)
(1, 18) 1
(1, 15) 1
(1, 4) 1
(2, 14) 1
(2, 2) 1
(2, 24) 1
(2, 12) 1
(2, 3) 1
(2, 10) 1
(2, 17) 1
(3, 7) 1
(3, 30) 1
(3, 21) 1
(3, 26) 1
(3, 27) 1
(4, 11) 1
(4, 25) 1
(4, 0) 1
(4, 13) 1
(4, 16) 1
(4, 26) 1
(4, 29) 1
(4, 10) 1
(4, 17) 1
(5, 6) 1
(5, 1) 1
(5, 23) 1
(5, 30) 1
(5, 27) 1
(5, 4) 1
31
[[0.92639376 0.07360624]
[0.06612091 0.93387909]
[0.9333989 0.0666011 ]
[0.09046681 0.90953319]
[0.94207392 0.05792608]
[0.07491194 0.92508806]]
[[1.49365323 0.50927624 1.49328501 1.49095433 1.46033301 1.49139637
0.50944135 0.50880935 1.49115491 1.49101833 2.51012186 1.49212922
1.48955188 1.4929011 1.4934558 0.50646043 1.49366054 3.47706696
0.50694741 0.50611621 1.49164314 0.50982271 1.4898492 0.50750537
1.49111169 1.49206142 1.50074828 0.50993049 0.50698316 1.4953623
0.50969679]
[0.50795946 1.49052036 0.5078402 0.50774569 2.523933 0.50774341
1.49141175 1.49176891 0.50921497 0.50947059 1.47604112 0.5074536
0.50875301 0.50672472 0.50892791 1.49283421 0.50640215 0.51044215
1.49225453 1.49221378 0.51014977 1.49129086 0.50989447 1.49156271
0.50877271 0.50670199 1.49160454 3.47513722 1.49181711 1.49908467
2.48368876]]
2019-05-20 19:43:01.917556 -------- 2019-05-20 19:43:01.963433
共消耗0秒