机器学习-朴素贝叶斯

一 开发环境:win7 64 位 + Pycharm5.0 + python3.4.4


二 工具包:numpy + matplotlib (对于microsoft visual c++ 10.0 is required错误,我是通过下载microsoft visual c++ 10.0解决的)


三 参考书籍:机器学习实战

朴素贝叶斯:

优点:在数据较小的情况下仍然有效,可以处理多类别问题

缺点:对于输入数据的准备方式比较敏感

使用数据类型:标称型数据

一般流程:

1)数据收集:可以使用任何方法;

2)准备数据:需要数值型或布尔型数据;

3)分析数据:有大量特征时,绘制特征作用不大,此时使用直方图效果更好;

4)训练数据:计算不同的独立特征的条件概率;

5)测试算法:计算错误率;

6)使用方法:常见用于文本分类。

四 程序清单:

1.准备数据阶段:

# Build the vocabulary: every distinct word seen across all documents.
def createVocabList(dataSet):
    """Return a list of the unique words appearing in any document of dataSet."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # union-in each document's words
    return list(vocab)
# Set-of-words model: convert a document into a 0/1 presence vector.
def setOfwords2Vec(vocabList, inputSet):
    """Return a vector of len(vocabList) with 1 where the word occurs in inputSet.

    vocabList: list of vocabulary words; inputSet: the document's words.
    """
    returnVec = [0] * len(vocabList)
    words = set(inputSet)  # O(1) membership test instead of scanning a list
    # enumerate gives the position directly, removing the original's
    # O(n^2) vocabList.index(word) lookup inside the loop
    for i, word in enumerate(vocabList):
        if word in words:
            returnVec[i] = 1
    return returnVec


# Bag-of-words model: convert a document into a per-word occurrence-count vector.
def bagOfwords2Vec(vocabList, inputSet):
    """Return a vector of len(vocabList) counting each vocab word's occurrences.

    BUG FIX: the original looped over vocabList and incremented at most once
    per word, so every entry was 0 or 1 — identical to the set-of-words model.
    A bag-of-words vector must count every occurrence in inputSet. This also
    removes the O(n^2) vocabList.index() scan.
    """
    freq = {}
    for word in inputSet:  # tally occurrences of each document word
        freq[word] = freq.get(word, 0) + 1
    return [freq.get(word, 0) for word in vocabList]
2.训练数据阶段:

# Naive Bayes trainer. trainMatrix: list of word-count vectors;
# trainCategory: 0/1 class label per document.
def trainNB0(trainMatrix, trainCategory):
    """Train the classifier and return (p0Vect, p1Vect, pAbusive).

    p0Vect / p1Vect are log per-word conditional probabilities for class 0 / 1;
    pAbusive is the prior probability of class 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)  # prior P(class = 1)

    # Laplace smoothing: word counts start at 1, totals at 2, so an unseen
    # word never yields a zero probability.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    totals = {0: 2.0, 1: 2.0}

    for doc, label in zip(trainMatrix, trainCategory):
        key = 1 if label == 1 else 0
        wordCounts[key] += doc
        totals[key] += sum(doc)

    # Log-space probabilities avoid floating-point underflow when multiplied.
    p0Vect = log(wordCounts[0] / totals[0])
    p1Vect = log(wordCounts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive
# Naive Bayes classifier. vec2Classify: numeric word vector for the document;
# p0Vec, p1Vec, pClass1: the values returned by trainNB0.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the document is more likely class 1, else 0.

    Scores are log posteriors (up to a shared constant):
    sum of per-word log likelihoods plus the log prior.
    """
    logPosterior1 = log(pClass1) + sum(vec2Classify * p1Vec)
    logPosterior0 = log(1 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if logPosterior1 > logPosterior0 else 0
3.算法测试-包括三个测试用例

1)简单输入测试:

def testingNB():
    """Smoke-test the classifier: train on the toy posts and classify two entries."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfwords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(trainMat, labels)
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = setOfwords2Vec(vocab, testEntry)
        print(testEntry, ' classify as:', classifyNB(thisDoc, p0V, p1V, pAb))
2)垃圾邮件检测

# Text parsing: split raw text into lowercase tokens longer than two chars.
def textParse(bigString):
    """Tokenize bigString on runs of non-word characters.

    BUG FIX: the original used r'\W*', which matches the empty string;
    on Python 3.7+ re.split then splits between every character, so no
    token ever survives the len > 2 filter. r'\W+' splits on runs of
    one-or-more non-word characters, as intended.
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
# Spam-filter test: hold-out validation over 25 spam + 25 ham emails.
def spamtext():
    """Train naive Bayes on the email corpus and print the hold-out error rate.

    Reads email/spam/1..25.txt (class 1) and email/ham/1..25.txt (class 0),
    randomly holds out 10 documents for testing, trains on the remaining 40.

    FIX: files are now opened with a `with` block so handles are closed
    deterministically; the original leaked open file objects.
    """
    docList = []
    classList = []
    fullText = []

    for i in range(1, 26):  # build the corpus: spam labelled 1, ham labelled 0
        for folder, label in (('spam', 1), ('ham', 0)):
            with open('email/{}/{}.txt'.format(folder, i)) as fh:
                wordList = textParse(fh.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)  # vocabulary over the whole corpus
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):  # randomly move 10 doc indices into the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []  # 0/1 document vectors
    trainClasses = []  # matching class labels
    for docIndex in trainingSet:
        trainMat.append(setOfwords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(trainMat, trainClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfwords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1

    print('the error rate is:', float(errorCount)/len(testSet))
3)从个人广告中获取区域倾向

# Frequency helper for the RSS classifier: find the most common words.
def calcWords(vocabList, fullText):
    """Return the 60 most frequent vocab words as (word, count) pairs, descending."""
    counts = [(token, fullText.count(token)) for token in vocabList]
    # stable sort keeps vocabList order among equal counts
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return counts[:60]
注:若只去除词频最高的 30 个单词,测试错误率高达 50%——对于二选一的分类问题,这与随机猜测无异;将去除数量增加到 60 个后,错误率可降至 30% 左右。

def localWords(feed1, feed0):
    """Train/test naive Bayes on two RSS feeds and return (vocabList, p0V, p1V).

    Entries from feed1 are labelled 1, entries from feed0 are labelled 0.
    The top-60 highest-frequency words (usually stop words) are removed
    from the vocabulary before training; 20 random documents are held out
    as the test set and the error rate is printed.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):  # build the corpus from both feeds in lockstep
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top60Words = calcWords(vocabList, fullText)  # high-frequency words, typically stop words
    for pairW in top60Words:
        # BUG FIX: calcWords returns (word, count) pairs, so the word is
        # pairW[0]; the original tested the tuple itself against a list of
        # strings, which never matched, so no stop word was ever removed.
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen))
    testSet = []
    for _ in range(20):  # randomly hold out 20 doc indices for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(bagOfwords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(trainMat, trainClass)
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
五 数据集和源码:

http://pan.baidu.com/s/1boyIE4J

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值