今天会用朴素贝叶斯对上一篇文章提到的乳腺癌数据进行训练并预测。
关于贝叶斯原理这里就不介绍了,网上有很多。这里说一下朴素一词的含义是,数据中的特征都是互相独立的。
现在来介绍一下针对该数据集的计算方法。对于一条数据,假设其有 m 个特征 $f_1,\dots,f_m$,n 种可能的分类 $c_1,\dots,c_n$,我们要计算的是在这些特征取相应值的情况下,这条数据分为某一类的概率,即 $P(c_i \mid f_1=v_1,\dots,f_m=v_m)$。
根据贝叶斯理论,可知: $P(c_i \mid f_1=v_1,\dots,f_m=v_m) = \dfrac{P(f_1=v_1,\dots,f_m=v_m \mid c_i)\,P(c_i)}{P(f_1=v_1,\dots,f_m=v_m)}$,其中分母对所有类别相同,比较大小时可以忽略。
又因为特征相互独立,所以上式的分子可以简化为 $P(c_i)\prod_{k=1}^{m} P(f_k=v_k \mid c_i)$。
根据这个公式,我们需要知道每个 $P(f_k=v_k \mid c_i)$,那么再用一次贝叶斯公式: $P(f_k=v_k \mid c_i) = \dfrac{P(c_i \mid f_k=v_k)\,P(f_k=v_k)}{\sum_{v} P(c_i \mid f_k=v)\,P(f_k=v)}$,其中 $P(c_i \mid f_k=v_k)$ 和 $P(f_k=v_k)$ 都可以直接从训练数据中统计得到。
所以,我们一共是用了两次贝叶斯公式。理论部分说完了,接下来直接上代码:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# --- Data loading, label encoding, train/test split, and class priors ---

# Column names for the UCI breast-cancer dataset; column 0 is the class label.
features = ['class','age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat']
df = pd.read_csv('breast-cancer.data', names=features)

# Integer-encode every categorical column in place so values can be used
# as array indices later on.
le = preprocessing.LabelEncoder()
for column in features:
    df[column] = le.fit_transform(df[column])

x = df.iloc[:, 1:]  # feature matrix (everything except the label)
y = df.iloc[:, 0]   # class labels (0 / 1 after encoding)
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=5, stratify=y)

# Class priors p(c_0) and p(c_1), estimated from the training labels.
labelCounts = train_y.value_counts()
numOfClass0 = labelCounts[0]
numOfClass1 = labelCounts[1]
probOfClass0 = numOfClass0 / (numOfClass0 + numOfClass1)
probOfClass1 = numOfClass1 / (numOfClass0 + numOfClass1)
def calcConditionalProb(dataSet, dataLabels, dataFeatures):
    """Estimate p(f_k = v | c) for every feature via a second Bayes inversion.

    For each feature column (dataFeatures[0] is the label name and is
    skipped), Laplace-smoothed per-class counts yield p(c | f_k = v) and
    p(f_k = v); multiplying and renormalizing over the feature's values
    recovers p(f_k = v | c).

    Returns a pair of dicts (class 0, class 1) mapping feature name to a
    numpy array indexed by the encoded feature value.
    """
    total = len(dataLabels)
    probsGivenClass0 = {}
    probsGivenClass1 = {}
    for feat in dataFeatures[1:]:
        column = dataSet.loc[:, feat]
        # Arrays sized by the largest encoded value seen; np.ones gives
        # every value a Laplace pseudo-count of 1 per class.
        size = max(sorted(column.value_counts().index)) + 1
        counts0 = np.ones(size)
        counts1 = np.ones(size)
        for idx, val in column.items():
            if dataLabels[idx] == 0:
                counts0[val] += 1
            else:
                counts1[val] += 1
        totals = counts0 + counts1
        pFeat = totals / total          # p(f_k = v)
        pC0GivenFeat = counts0 / totals  # p(c_0 | f_k = v)
        pC1GivenFeat = counts1 / totals  # p(c_1 | f_k = v)
        # Bayes: p(f_k = v | c) ∝ p(c | f_k = v) * p(f_k = v),
        # renormalized over all values of the feature.
        joint0 = pC0GivenFeat * pFeat
        joint1 = pC1GivenFeat * pFeat
        probsGivenClass0[feat] = joint0 / np.sum(joint0)
        probsGivenClass1[feat] = joint1 / np.sum(joint1)
    return probsGivenClass0, probsGivenClass1
def classify(testData, condiProbOnClass0, condiProbOnClass1):
    """Return the predicted class (0 or 1) for a single encoded sample.

    Computes the unnormalized posterior for each class — the product of
    p(f_k = v_k | c) over all features times the class prior — and picks
    the larger one. Relies on the module-level priors probOfClass0 and
    probOfClass1.
    """
    product0 = 1.0
    product1 = 1.0
    for feat in testData.index:
        val = testData[feat]
        product0 *= condiProbOnClass0[feat][val]
        product1 *= condiProbOnClass1[feat][val]
    # Ties go to class 1, matching the original strict comparison.
    return 0 if product0 * probOfClass0 > product1 * probOfClass1 else 1
def test(train_x,train_y,test_x,test_y,dataFeatures):
condiProbOnClass0,condiProbOnClass1=calcConditionalProb(train_x,train_y,dataFeatures)
rightNum = 0
for i in range(len(test_x)):
if classify(test_x.iloc[i,:],condiProbOnClass0,condiProbOnClass1)==train_y.iloc[i,]:
rightNum += 1
return rightNum / len(test_x)
# Train on the 80% split and print accuracy on the held-out 20% split.
print(test(train_x,train_y,test_x,test_y,features))