Python program — naive Bayes text classification
# naive Bayes
import numpy as np
# Load the training data.
def loadDataSet():
    """Read 'testSet.txt' from the working directory.

    Each line is whitespace-separated words followed by one integer
    class label as the last token.

    Returns:
        dataMat: list of word lists, one per non-empty line.
        labelMat: list of int class labels, aligned with dataMat.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file is closed; the original leaked the handle.
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()  # strip ends, split on whitespace
            if not lineArr:
                continue  # skip blank lines (original crashed on them)
            # BUG FIX: the original iterated range(len(lineArr) - 2), which
            # dropped the LAST WORD of every document. Only the final token
            # is the label, so keep everything except that one token.
            dataMat.append(lineArr[:-1])
            labelMat.append(int(lineArr[-1]))
    return dataMat, labelMat
# Build the vocabulary for the set-of-words model.
def creatVocabList(dataSet):
    """Return a list of every distinct word seen across all documents.

    Each word appears once regardless of how many documents contain it.
    """
    vocab = set()
    for document in dataSet:
        vocab.update(document)  # in-place set union with this document's words
    return list(vocab)
# Convert a document into its word vector.
def setOfWords2Vec(testList, myVocabVec):
    """Map a list of words onto a binary presence vector over the vocabulary.

    Position i is 1 when myVocabVec[i] occurs in testList, else 0.
    Out-of-vocabulary words are reported and ignored.
    """
    vec = [0] * len(myVocabVec)
    for word in testList:
        try:
            vec[myVocabVec.index(word)] = 1  # mark the word as present
        except ValueError:
            # .index raises ValueError exactly when the word is unknown
            print('the word: %s is not in my Vocabulary!' %word)
    return vec
# Build the training matrix: one word vector per training document.
def createTrainMatrix(trainListing, myVocabVec):
    """Convert every document in trainListing into a word vector and
    stack the vectors into one list-of-lists training matrix."""
    return [setOfWords2Vec(doc, myVocabVec) for doc in trainListing]
# Train the naive Bayes model.
def trainNB0(trainMatrix,tarinCategory):
    """Estimate naive-Bayes parameters from binary word vectors.

    Args:
        trainMatrix: list of 0/1 word vectors, one per document.
        tarinCategory: 0/1 class labels aligned with trainMatrix.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word LOG probabilities for class 0
        and class 1, and the prior P(class = 1).
    """
    docCount = len(trainMatrix)
    featureCount = len(trainMatrix[0])  # == len(myVocabVec)
    # Prior: fraction of documents labeled 1.
    pAbusive = sum(tarinCategory) / float(docCount)
    # Laplace smoothing: per-word counts start at 1 and denominators at 2,
    # so a word unseen in one class never gets probability zero.
    count0 = np.ones(featureCount)
    count1 = np.ones(featureCount)
    total0 = 2.0
    total1 = 2.0
    for docVec, label in zip(trainMatrix, tarinCategory):
        if label == 1:
            count1 += docVec           # per-word occurrence counts, class 1
            total1 += sum(docVec)      # total words seen in class 1
        else:
            count0 += docVec
            total0 += sum(docVec)
    # Take logs so later per-word products become sums (avoids underflow).
    p1Vect = np.log(count1 / total1)   # log P(w_i | class = 1)
    p0Vect = np.log(count0 / total0)   # log P(w_i | class = 0)
    return p0Vect,p1Vect,pAbusive
# Naive Bayes classification: decide whether the test document is class 0 or 1.
def NBclassify(testDoc,myVocabVec,p0Vect,p1Vect,pClass1):
    """Classify testDoc by comparing the two class log-posteriors.

    Computes log P(c) + sum(log P(w|c)) over the words present in the
    document for c in {0, 1} and returns the arg-max class.
    """
    docVec = setOfWords2Vec(testDoc, myVocabVec)
    # Element-wise product keeps only log-probs of words present in the doc.
    p1 = sum(docVec * p1Vect) + np.log(pClass1)
    p0 = sum(docVec * p0Vect) + np.log(1 - pClass1)
    print('p0=%s\n' % p0)
    print('p1=%s\n' % p1)
    return 1 if p1 > p0 else 0
##########################
# Driver: load the data, build the vocabulary, train the model, then
# classify two hand-written test documents. Requires 'testSet.txt' in
# the working directory.
listing,listclass = loadDataSet()
myVocabVec = creatVocabList(listing)
trainMatrix = createTrainMatrix(listing,myVocabVec)
p0Vect,p1Vect,pClass1 = trainNB0(trainMatrix,listclass)
testEntry0 = ['i','eat','a','lot','school']  # presumably a class-0 ("good") example — confirm against the data
testEntry1 = ['stupid','shit']               # presumably a class-1 ("bad") example
result0 = NBclassify(testEntry0,myVocabVec,p0Vect,p1Vect,pClass1)
result1 = NBclassify(testEntry1,myVocabVec,p0Vect,p1Vect,pClass1)
# Home-made ternary operator: index a two-element list with a boolean
# (False -> 0 -> 'good', True -> 1 -> 'bad').
print('result0 is %s' %(['good','bad'][result0==1]))
print('result1 is %s' %(['good','bad'][result1==1]))
Run results
Reference:
*Machine Learning in Action* (Peter Harrington), Ch. 4 — classifying with probability theory: naive Bayes