一 开发环境:win7 64 位 + Pycharm5.0 + python3.4.4
二 工具包:numpy + matplotlib (对于microsoft visual c++ 10.0 is required错误,我是通过下载microsoft visual c++ 10.0解决的)
三 参考书籍:机器学习实战
朴素贝叶斯:
优点:在数据较小的情况下仍然有效,可以处理多类别问题
缺点:对于输入数据的准备方式比较敏感
使用数据类型:标称型数据
一般流程:
1)数据收集:可以使用任何方法;
2)准备数据:需要数值型或布尔型数据;
3)分析数据:有大量特征时,绘制特征作用不大,此时使用直方图效果更好;
4)训练数据:计算不同的独立特征的条件概率;
5)测试算法:计算错误率;
6)使用方法:常见用于文本分类。
四 程序清单:
1.准备数据阶段:
# Build the vocabulary: a list of every distinct word appearing in the corpus.
# dataSet: list of tokenized documents (each a list of word strings).
def createVocabList(dataSet):
    """Return a list of the unique tokens found across all documents in dataSet."""
    vocab = set()
    for document in dataSet:
        vocab.update(document)          # union-in this document's words
    return list(vocab)
# Set-of-words model: convert a document into a 0/1 vocabulary vector.
# vocabList: list of vocabulary words; inputSet: tokenized document.
def setOfwords2Vec(vocabList, inputSet):
    """Return a vector with 1 at each vocabulary position whose word occurs in inputSet."""
    returnVec = [0] * len(vocabList)    # one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:           # words outside the vocabulary are ignored
            returnVec[vocabList.index(word)] = 1
    return returnVec

# Bag-of-words model: convert a document into a vector of word counts.
# vocabList: list of vocabulary words; inputSet: tokenized document.
def bagOfwords2Vec(vocabList, inputSet):
    """Return a vector counting how many times each vocabulary word occurs in inputSet.

    BUG FIX: the original iterated over vocabList and tested membership in
    inputSet, so each word was counted at most once — the "bag" behaved
    exactly like the set model.  Iterating over inputSet counts duplicates.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

# 2. Training phase:
# Naive Bayes training function.
# trainMatrix: list of numeric document vectors; trainCategory: 0/1 class labels.
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model.

    Returns (p0Vect, p1Vect, pAbusive): per-word log-probabilities
    conditioned on class 0 and class 1, and the prior P(class = 1).
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labelled 1.
    pAbusive = sum(trainCategory) / float(numDocs)
    # Laplace smoothing: counts start at 1, denominators at 2,
    # so an unseen word never yields a zero probability.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for doc, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += doc                # per-word counts for class 1
            p1Denom += sum(doc)         # total words seen in class 1
        else:
            p0Num += doc
            p0Denom += sum(doc)
    # Log-probabilities prevent underflow when many small factors multiply.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Naive Bayes classification function.
# vec2Classify: numeric document vector; p0Vec, p1Vec, pClass1: trainNB0 output.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 log posterior exceeds class-0's, else 0.

    Sums of log-probabilities replace products of probabilities; the
    shared evidence term cancels, so only the numerators are compared.
    """
    logP1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logP0 = sum(vec2Classify * p0Vec) + log(1 - pClass1)
    return 1 if logP1 > logP0 else 0

# 3. Algorithm testing — three test cases
1)简单输入测试:
def testingNB():
    """Smoke test: train on the book's toy post data set and classify two sample posts."""
    listOposts, listClass = loadDataSet()
    myVocabList = createVocabList(listOposts)
    # Digitize every training post with the set-of-words model.
    trainMat = [setOfwords2Vec(myVocabList, postinDoc) for postinDoc in listOposts]
    p0V, p1V, pAb = trainNB0(trainMat, listClass)
    # Classify two hand-picked entries (one benign, one abusive).
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = setOfwords2Vec(myVocabList, testEntry)
        print(testEntry, ' classify as:', classifyNB(thisDoc, p0V, p1V, pAb))

# 2) Spam e-mail detection
# Text parsing: split a raw string into lowercase tokens.
def textParse(bigString):
    """Return the lowercase tokens of bigString longer than two characters.

    BUG FIX: the original split on r'\W*', a pattern that can match the
    empty string; since Python 3.7 re.split treats empty matches as split
    points, shattering the text into single characters (which the len > 2
    filter then discards entirely).  r'\W+' splits on runs of non-word
    characters as intended.
    """
    listOfTokens = re.split(r'\W+', bigString)
    # Drop short tokens ('a', 'is', punctuation residue) and normalize case.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
# Spam-filter evaluation function.
def spamtext():
    """Train on 40 of 50 emails, hold out a random 10, and print the error rate."""
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):                      # build the data set
        # Spam sample -> class 1.
        wordList = textParse(open('email/spam/{}.txt'.format(i)).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Ham sample -> class 0.
        wordList = textParse(open('email/ham/{}.txt'.format(i)).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)        # build the vocabulary
    # Randomly carve 10 of the 50 documents out as a held-out test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])  # move index into the test set
        del trainingSet[randIndex]              # ...and out of the training set
    # Digitize the training documents (0/1 set-of-words vectors).
    trainMat = [setOfwords2Vec(vocabList, docIndex and docList[docIndex] or docList[docIndex]) for docIndex in trainingSet]
    trainMat = [setOfwords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pAb = trainNB0(trainMat, trainClasses)
    # Score the held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfwords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))

# 3) Inferring regional attitudes from personal ads
# Frequency helper for the RSS classifier: find the most common words.
def calcWords(vocabList, fullText):
    """Return the 60 most frequent vocabulary words as (word, count) pairs, most frequent first."""
    freqDict = {token: fullText.count(token) for token in vocabList}
    ranked = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:60]

# NOTE (author): removing only the top 30 words left the error rate near 50% —
# no better than a coin flip on a two-way choice; raising the cut to 60
# brought it down to about 30%.
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two RSS feeds.

    feed1/feed0: parsed RSS feeds (dicts with an 'entries' list whose items
    carry a 'summary' string).  Prints the hold-out error rate and returns
    (vocabList, p0V, p1V).
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):                     # build the data set from both feeds
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Remove the most frequent words (mostly stop words) from the vocabulary.
    top60Words = calcWords(vocabList, fullText)
    for pairW in top60Words:
        # BUG FIX: pairW is a (word, count) tuple, so the original test
        # `pairW in vocabList` was always False and no word was ever removed.
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Randomly hold out 20 documents for testing.
    trainingSet = list(range(2*minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:                # digitize with the bag-of-words model
        trainMat.append(bagOfwords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(trainMat, trainClass)
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

# 5. Data set and source code:
http://pan.baidu.com/s/1boyIE4J