Python implementation
The implementation rests on the naive Bayes conditional-independence assumption: p(w|c1) = p(w1|c1) * p(w2|c1) * p(w3|c1) * ... * p(wn|c1).
By Bayes' rule, p(c1|w) = p(w|c1) * p(c1) / p(w), where p(w) is the sum of p(w|ci) * p(ci) over all classes i, so p(w) is the same constant no matter which class we score.
Therefore, to classify a document it is enough to compute and compare p(w|c1) * p(c1) for each class; the shared denominator can be dropped.
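The denominator argument is easiest to see with concrete numbers. The toy probabilities below are made up for illustration only (they do not come from the data set used later); the point is simply that dividing both sides by the shared p(w) cannot change which class wins the comparison:

# Toy two-class, two-word example with made-up probabilities.
p_w_c1, p_c1 = 0.002 * 0.01, 0.4   # p(w|c1) = p(w1|c1)*p(w2|c1), prior p(c1)
p_w_c0, p_c0 = 0.001 * 0.03, 0.6   # same quantities for class c0
p_w = p_w_c1 * p_c1 + p_w_c0 * p_c0  # the shared denominator p(w)

# Comparing the full posteriors ...
print((p_w_c1 * p_c1) / p_w > (p_w_c0 * p_c0) / p_w)   # False: c0 wins
# ... gives exactly the same decision as comparing only the numerators:
print(p_w_c1 * p_c1 > p_w_c0 * p_c0)                   # False: c0 still wins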
import re
import numpy as np


def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec


# Build the vocabulary: a list of all unique words across the documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        # set union
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


# Set-of-words model: turn a document into a 0/1 vector over the vocabulary
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # index of the word in the vocabulary
            returnVec[vocabList.index(word)] = 1
        else:
            print('the word: %s is not in my vocabulary!' % word)
    return returnVec


# Bag-of-words model: count how many times each word occurs
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


# Train on the data set
def trainB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # fraction of abusive documents, i.e. p(c=1)
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # numerators for p(x|c=0), factored as p(x1|c=0)*p(x2|c=0)*... by the naive assumption
    # p0Num = np.zeros(numWords)
    p0Num = np.ones(numWords)
    # numerators for p(x|c=1), as above
    # p1Num = np.zeros(numWords)
    p1Num = np.ones(numWords)
    # p0Denom = 0.0
    # p1Denom = 0.0
    p0Denom = 2.0  # counts start at 1 and denominators at 2 (Laplace smoothing)
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # p1Vect = p1Num / p1Denom
    # p0Vect = p0Num / p0Denom
    # take logs to avoid numerical underflow
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


# vec2Classify is the word vector of the document to classify
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


def textTrain():
    postingList, classVec = loadDataSet()
    vocabList = createVocabList(postingList)
    trainMatrix = []
    for postinDoc in postingList:
        # turn each document into a vector and add it to the training matrix
        trainMatrix.append(setOfWords2Vec(vocabList, postinDoc))
        # trainMatrix.append(bagOfWords2VecMN(vocabList, postinDoc))
    p0v, p1v, pAb = trainB0(trainMatrix, classVec)
    print(p0v)
    print(p1v)
    print(pAb)


# textTrain()


def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pAb = trainB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0v, p1v, pAb)))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0v, p1v, pAb)))


# Split a long string into lowercase tokens, dropping tokens shorter than 3 characters
def textParse(bigString):
    # split on any run of non-word characters (not letters, digits, or underscore)
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
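Two details of trainB0 above deserve a note. Initializing the counts with np.ones and the denominators with 2.0 is Laplace smoothing: without it, a single word that never occurs in one class would drive the whole product p(w1|c)*...*p(wn|c) to zero. The log transform is there because multiplying many small probabilities underflows to 0.0 in floating point, while the equivalent sum of logs stays usable. A minimal sketch of the underflow (the 0.01 factors are arbitrary, chosen only to trigger it):

import numpy as np

probs = np.full(500, 0.01)       # 500 conditional probabilities of 0.01 each
print(np.prod(probs))            # 0.0 -- the raw product underflows
print(np.sum(np.log(probs)))     # about -2302.6 -- the log-sum is still well-behaved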
# Automated test of the naive Bayes spam classifier
# using a randomly selected hold-out set
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary
    vocabList = createVocabList(docList)
    # candidate training indices
    trainingSet = list(range(50))
    testSet = []
    # randomly split the data into a training set and a test set
    for i in range(10):
        # pick a random index in 0-49, i.e. pick a random sample
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainB0(trainMat, trainClasses)
    errorCount = 0
    # evaluate on the held-out test set
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is:", float(errorCount) / len(testSet))


import feedparser

# ny = feedparser.parse('http://feed.cnblogs.com/blog/u/205667/rss')
# print(ny['entries'][0]['summary'])
# spamTest()


# RSS feed classifier and frequent-word removal
# Count word frequencies and return the 30 most frequent words
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]


# Load the stop words from a file
def load_the_remov_words():
    rmwords = []
    with open('removeWords.txt', 'r') as fr:
        for line in fr.readlines():
            rmwords.append(line.strip())
    return rmwords
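spamTest above (and localWords below) builds the hold-out set by repeatedly drawing an index with np.random.uniform and deleting it from trainingSet. An equivalent and slightly more direct split can be written with the standard library's random.sample; this is only an alternative sketch, and the helper name split_indices is not part of the original script:

import random

def split_indices(n_docs, n_test):
    # pick n_test distinct indices for the test set, keep the rest for training
    test_set = random.sample(range(n_docs), n_test)
    training_set = [i for i in range(n_docs) if i not in test_set]
    return training_set, test_set

# e.g. for the 50 emails used by spamTest:
# training_set, test_set = split_indices(50, 10)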
# Build word vectors from two RSS feeds and train/evaluate the classifier
def localWords(feed1, feed0):
    # list of documents (one word list per feed entry)
    docList = []
    # class labels
    classList = []
    # all words, duplicates included
    fullText = []
    # use the length of the shorter feed
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # split the summary of each entry into a list of words
        wordList = textParse(feed1['entries'][i]['summary'])
        # store this document's word list
        docList.append(wordList)
        # collect every word in one flat list
        fullText.extend(wordList)
        # record the class label
        classList.append(1)
        # same for the second feed
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary of unique words
    vocabList = createVocabList(docList)
    # get the 30 most frequent words, returned as [(word, count), (word, count), ...]
    top30Words = calcMostFreq(vocabList, fullText)
    # remove the most frequent words from the vocabulary
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # remove stop words
    rmwords = load_the_remov_words()
    for pairW in rmwords:
        if pairW in vocabList:
            vocabList.remove(pairW)
    # indices of all documents
    trainingSet = list(range(2 * minLen))
    # test set indices
    testSet = []
    # randomly pick twenty documents as the test set
    for i in range(20):
        # pick a random index
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # add it to the test set
        testSet.append(trainingSet[randIndex])
        # and remove it from the training indices
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # bag-of-words vector for each training document
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        # and its class label
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V


# Print, in sorted order, the words whose conditional log probability exceeds a threshold
def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**" * 14)
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**" * 14)
    for item in sortedNY:
        print(item[0])


ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
getTopWords(ny, sf)

# vocabList, pSF, pNY = localWords(ny, sf)
# print(vocabList)
# print("-" * 20)
# print(pSF)
# print('------------')
# print(pNY)
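For reference, the -6.0 cutoff in getTopWords is a threshold on the log probabilities returned by trainB0, i.e. it keeps words with p(word|class) above roughly exp(-6) ≈ 0.0025. Note also that the two Craigslist RSS URLs may no longer return any entries; if a feed comes back empty, minLen becomes 0 and trainB0 then fails on an empty training matrix. A small defensive wrapper can guard against that (safe_top_words is an added sketch for illustration, not part of the original script):

def safe_top_words(url1, url0):
    # parse both feeds and only run the classifier when both actually contain entries
    feed1 = feedparser.parse(url1)
    feed0 = feedparser.parse(url0)
    if not feed1['entries'] or not feed0['entries']:
        print('one of the feeds returned no entries, skipping')
        return
    getTopWords(feed1, feed0)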
Stop-word file