import numpy as np
from numpy import array
import math
def loadDataSet():
    """Load the built-in training data.

    Returns:
        A ``(samples, labels)`` pair. ``samples`` is a list of 17 rows, each a
        list of six categorical attribute values (strings). ``labels`` is a
        parallel list of ints: 1 for the first eight rows, 0 for the last nine.
    """
    # Rows labelled 1.
    positive_rows = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑'],
    ]
    # Rows labelled 0.
    negative_rows = [
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑'],
    ]
    samples = positive_rows + negative_rows
    labels = [1] * len(positive_rows) + [0] * len(negative_rows)
    return samples, labels
def createVocabList(dataSet):
    """Return the distinct words of ``dataSet`` in first-seen order.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once. Words appear in
        the order they are first encountered, so the result (and therefore
        the column order of any vector built from it) is identical on every
        run, unlike ``list(set(...))`` whose order varies with string hash
        randomization.

    Side effects:
        Prints the vocabulary to stdout.
    """
    # dict keys keep insertion order, which gives a stable de-duplication.
    vocab = list(dict.fromkeys(word for document in dataSet for word in document))
    print(vocab)
    return vocab
def setOfWords2Vec(vocabList, inputSet):
    """Encode ``inputSet`` as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of known (hashable) words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words to encode.

    Returns:
        A list of ints, the same length as ``vocabList``, holding 1 at the
        position of every vocabulary word that occurs in ``inputSet`` and 0
        elsewhere.

    Side effects:
        Prints a message for every word of ``inputSet`` that is not in the
        vocabulary; such words do not affect the returned vector.
    """
    # Build the word -> index map once instead of scanning the list twice
    # (``in`` + ``.index``) per word. ``setdefault`` keeps the first position
    # of a repeated vocabulary word, matching ``list.index``.
    position_of = {}
    for position, vocab_word in enumerate(vocabList):
        position_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
def trainNB0(trainMatrix, trainCategory, smoothing=0.0):
    """Estimate per-class feature presence rates and the class-1 prior.

    Args:
        trainMatrix: non-empty sequence of equal-length numeric feature
            vectors, one per training document.
        trainCategory: sequence of class labels, one per document. A label
            equal to 1 selects class 1; any other label is counted as class 0.
        smoothing: optional non-negative additive smoothing constant. Each
            rate is computed as ``(count + smoothing) / (docs + 2 * smoothing)``.
            The default 0.0 gives the plain ``count / docs`` estimate.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` are float
        arrays holding, per feature, the summed feature values of the class
        divided by the number of documents in that class, and ``pAbusive`` is
        ``sum(trainCategory) / len(trainMatrix)``. A class with no documents
        (and no smoothing) gets an all-zero vector instead of NaN.

    Raises:
        ValueError: if ``trainMatrix`` is empty, if the two inputs differ in
            length, or if ``smoothing`` is negative.

    Side effects:
        Prints the class-1 feature totals and the class-1 document count.
    """
    numTrainDocs = len(trainMatrix)
    if numTrainDocs == 0:
        raise ValueError("trainMatrix must contain at least one document")
    if len(trainCategory) != numTrainDocs:
        raise ValueError("trainMatrix and trainCategory must have the same length")
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    features = np.asarray(trainMatrix, dtype=float)
    is_class1 = np.asarray(trainCategory) == 1
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Column sums per class; an empty selection sums to a zero vector.
    p1Num = features[is_class1].sum(axis=0)
    p0Num = features[~is_class1].sum(axis=0)
    p1Denom = float(np.count_nonzero(is_class1))
    p0Denom = float(numTrainDocs) - p1Denom
    print(p1Num)
    print(p1Denom)

    def _presence_rate(totals, doc_count):
        """Return the (optionally smoothed) per-feature rate for one class."""
        denominator = doc_count + 2.0 * smoothing
        if denominator == 0:
            # No documents and no smoothing: avoid 0/0 -> NaN.
            return np.zeros_like(totals)
        return (totals + smoothing) / denominator

    p1Vect = _presence_rate(p1Num, p1Denom)
    p0Vect = _presence_rate(p0Num, p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more probable class for a feature vector (naive Bayes).

    For each class the score is the log2 prior plus the sum of
    ``vec2Classify[i] * log2(p[i])`` over the features that are present
    (``vec2Classify[i] > 0``). Working in log space keeps the likelihood and
    the prior on the same scale; adding raw probabilities to a log prior does
    not produce a posterior.

    Args:
        vec2Classify: numeric feature vector (0/1 presence flags or counts).
        p0Vec: per-feature conditional probabilities for class 0.
        p1Vec: per-feature conditional probabilities for class 1.
        pClass1: prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        '好瓜' when class 1 scores strictly higher, otherwise '坏瓜'. Ties
        resolve to '坏瓜' so that a label is always returned.
    """
    weights = np.asarray(vec2Classify, dtype=float)
    present = weights > 0

    def _log_score(cond_probs, prior):
        """Return log2(prior) + weighted log2-likelihood of present features."""
        probs = np.asarray(cond_probs, dtype=float)[present]
        # A zero probability or prior legitimately maps to -inf (impossible
        # class); silence the divide-by-zero warning instead of raising.
        with np.errstate(divide="ignore"):
            return float(np.sum(weights[present] * np.log2(probs)) + np.log2(prior))

    p1 = _log_score(p1Vec, pClass1)
    p0 = _log_score(p0Vec, 1 - pClass1)
    return '好瓜' if p1 > p0 else '坏瓜'
# Load the samples, derive the vocabulary, and encode every sample as a
# presence vector over that vocabulary.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
print(trainMat)

# Fit the classifier and show the learned per-class vectors and prior.
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print(p0V, p1V, pAb)

# Classify one sample and report the predicted label.
testEntry = ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))