【机器学习】周志华西瓜书第七章贝叶斯分类器习题--实现拉普拉斯修正的朴素贝叶斯分类器,以西瓜数据集3.0为训练集,对“测1”进行判别

watermelon_4.3.xlsx

4d36c53e6c404d308302f5c00892938a.png

 


from numpy import *
import numpy as np
import pandas as pd

#读取文件格式为xlsx的数据
def dataLoad(filename):
	"""Load the watermelon dataset from an .xlsx file.

	Fix: the original body read the misspelled global ``fliename`` instead of
	the ``filename`` parameter, so the function only worked by accident when
	that global happened to exist in the caller's scope.

	Args:
		filename: path of the Excel workbook to read.

	Returns:
		propLabelSet: 1xP array of attribute names (first row, with the id
			and label columns stripped), e.g. 色泽/根蒂/敲声/...
		dataSet: NxP array of sample attribute values.
		labelSet: Nx1 array of class labels.
	"""
	df = pd.read_excel(filename, header=None)
	propLabelSet = df.values[0:1, 1:-1]  # attribute-name row
	dataSet = df.values[1:, 1:-1]        # sample data
	labelSet = df.values[1:, -1:]        # class labels
	return propLabelSet, dataSet, labelSet

#计算训练样本加上预测样本后,每个属性类别的个数,并保存在一个列表中用于在函数trainNB()中进行拉普拉斯修正
def getPropNum(TTdataSet):
	"""Count, per attribute column, how many distinct values it takes.

	The counts serve as the Laplace-smoothing denominators in trainNB();
	the input should contain training plus test samples so that values
	appearing only in the test set are still counted.

	Args:
		TTdataSet: 2-D array, one attribute per column.

	Returns:
		1-D float array whose entry i is the number of distinct values of
		attribute i.
	"""
	numCols = shape(TTdataSet)[1]
	distinctCounts = zeros(numCols)
	for col in range(numCols):
		distinctCounts[col] = len(set(TTdataSet[:, col]))
	return distinctCounts


#训练朴素贝叶斯,将每个属性标签的后验概率存储在模型字典里,对测试样本分类时可直接调用
def trainNB(propLabelSet, trainSet, allLabelSet, numDataProp):
	"""Train a Laplace-corrected naive Bayes model.

	Discrete (string) attributes get Laplace-smoothed conditional
	frequencies; continuous attributes are modelled per class as Gaussians
	with maximum-likelihood mean and standard deviation.

	Fix: the variance computation previously filtered rows with the global
	``labelData`` instead of the ``allLabelSet`` parameter, so the function
	broke (or silently used wrong labels) whenever that global did not match
	the argument. Also hoists the repeated ``tolist()`` conversions.

	Args:
		propLabelSet: 1xP array of attribute names.
		trainSet: NxP array of training samples.
		allLabelSet: Nx1 array of class labels for the training samples.
		numDataProp: per-attribute distinct-value counts (from getPropNum),
			used as Laplace denominators.

	Returns:
		modelDict: {attribute name -> per-class statistics}; for a discrete
			attribute {value -> {class -> P(value | class)}}, for a
			continuous one {class -> {'均值': mean, '标准差': std}}.
		priorProb: {class -> Laplace-smoothed prior probability}.
		numYandN: {class -> raw count of training samples in that class}.
	"""
	numTrain, numProp = shape(trainSet)
	modelDict = {}   # trained model: attribute name -> statistics
	priorProb = {}   # class priors (Laplace-corrected)
	numYandN = {}    # raw per-class sample counts
	labelList = allLabelSet.flatten().tolist()  # list so .count() works
	labelSet = list(set(labelList))
	N = len(labelSet)
	for label in labelSet:
		# Laplace-corrected prior: (count + 1) / (N_samples + N_classes).
		priorProb[label] = float((labelList.count(label) + 1) / (numTrain + N))
		numYandN[label] = labelList.count(label)
	for i in range(numProp):
		if type(trainSet[0][i]).__name__ == 'str':
			# Discrete attribute: smoothed conditional frequency per value.
			discretePropDict = {}
			Ni = numDataProp[i]  # number of distinct values of attribute i
			for item in list(set(trainSet[:, i])):
				Y_and_N = {}  # {class -> P(attribute == item | class)}
				for result in labelSet:
					countAll = labelList.count(result)
					countData = 0
					for j in range(numTrain):
						if (trainSet[j][i] == item) and (allLabelSet[j] == result):
							countData += 1
					# Laplace correction: (count + 1) / (class count + Ni).
					Y_and_N[result] = float((countData + 1) / float(countAll + Ni))
				discretePropDict[item] = Y_and_N
			modelDict[propLabelSet[0][i]] = discretePropDict
		else:
			# Continuous attribute: per-class Gaussian (ML mean / std).
			Y_and_N = {}
			for result in labelSet:
				exp_and_varroot = {}
				countAll = labelList.count(result)
				expec = float(sum([trainSet[j, i] for j in range(numTrain)
								   if allLabelSet[j] == result])) / countAll
				exp_and_varroot['均值'] = expec
				# BUG FIX: was filtering on the unrelated global ``labelData``.
				var = float(sum([float(trainSet[j, i] - expec) ** 2
								 for j in range(numTrain)
								 if allLabelSet[j] == result]) / countAll)
				exp_and_varroot['标准差'] = sqrt(var)
				Y_and_N[result] = exp_and_varroot
			modelDict[propLabelSet[0][i]] = Y_and_N
	return modelDict, priorProb, numYandN

#计算连续型属性的概率
def calContinuiousProb(data, expec, varroot):
	"""Gaussian probability density of ``data`` for mean ``expec`` and
	standard deviation ``varroot``."""
	coeff = 1 / (sqrt(2 * pi) * varroot)
	exponent = -((data - expec) ** 2) / (2 * varroot ** 2)
	return float(coeff * exp(exponent))


#对测试数据进行预测
def testDataPredict(testSet, trainModel, priorProba, propLabel, numDataPro, numYandN):
	"""Classify each test sample with the trained model and print the verdict.

	For each class the posterior is prior * product of per-attribute
	likelihoods: Gaussian density for continuous attributes, the smoothed
	frequency for discrete ones, and a pure Laplace term 1/(class count + Ni)
	for attribute values never seen in training.

	Fixes vs. the original:
	  * the continuous/discrete test used ``testSet[0][j]`` (always row 0)
	    instead of the current row ``testSet[i][j]``;
	  * removed the stray debug ``print(trainModel)``.

	Args:
		testSet: MxP array of samples to classify.
		trainModel: model dict returned by trainNB().
		priorProba: class priors returned by trainNB().
		propLabel: 1xP array of attribute names.
		numDataPro: per-attribute distinct-value counts (from getPropNum).
		numYandN: raw per-class training counts returned by trainNB().
	"""
	m, n = shape(testSet)
	for i in range(m):
		prob = {}
		for item in priorProba.keys():
			prob[item] = 1.0 * priorProba[item]
			for j in range(n):
				if type(testSet[i][j]).__name__ != 'str':
					# Continuous attribute: Gaussian density.
					prob[item] *= calContinuiousProb(
						testSet[i][j],
						trainModel[propLabel[0][j]][item]['均值'],
						trainModel[propLabel[0][j]][item]['标准差'])
				elif testSet[i][j] in trainModel[propLabel[0][j]].keys():
					prob[item] *= trainModel[propLabel[0][j]][testSet[i][j]][item]
				else:
					# Value never seen in training: fall back to Laplace term.
					print('%s不在训练样本的%s属性中出现过,是新的属性类别' % (testSet[i][j], propLabel[0][j]))
					prob[item] *= float(1 / (numYandN[item] + numDataPro[j]))
		print(prob)
		if prob['是'] > prob['否']:
			print('测试样本%d是好瓜' % (i))
		else:
			print('测试样本%d是坏瓜' % (i))

if __name__ == "__main__":
	# Train on watermelon dataset 3.0 and classify the samples in the same
	# workbook ("测1" style test samples).
	fliename = 'watermelon_4.3.xlsx'
	propLabel, trainData, labelData = dataLoad(fliename)
	# Test rows re-read with pandas' default header handling; drop id/label cols.
	testData = pd.read_excel(fliename).values[:, 1:-1]
	# Pool train + test so distinct-value counts also cover unseen test values.
	TTdata = np.concatenate((trainData, testData))
	numPropData = getPropNum(TTdata)
	modelDict, priorProb, numYesandNo = trainNB(propLabel, trainData, labelData, numPropData)
	testDataPredict(testData, modelDict, priorProb, propLabel, numPropData, numYesandNo)

 

 

  • 1
    点赞
  • 23
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
好的,以下是使用西瓜数据集3.0实现朴素贝叶斯分类器的Python代码: ```python import pandas as pd import numpy as np # 读取西瓜数据集3.0 watermelon_data = pd.read_csv('watermelon_3.csv') # 定义朴素贝叶斯分类器 class NaiveBayes: def __init__(self): self.p_y = {} # 存储每个类别的先验概率 self.p_xy = {} # 存储每个特征在每个类别下的条件概率 def fit(self, X, y): n_samples, n_features = X.shape classes = np.unique(y) n_classes = len(classes) # 计算每个类别的先验概率 for c in classes: self.p_y[c] = np.sum(y == c) / n_samples # 计算每个特征在每个类别下的条件概率 for f in range(n_features): for c in classes: X_c = X[y == c] self.p_xy[(f, c)] = (X_c[:, f].mean(), X_c[:, f].std()) def predict(self, X): y_pred = np.zeros(X.shape[0]) for i, x in enumerate(X): max_posterior = -1 max_class = -1 for c in self.p_y: # 计算后验概率 posterior = self.p_y[c] for f, value in enumerate(x): mean, std = self.p_xy[(f, c)] likelihood = self.gaussian_pdf(value, mean, std) posterior *= likelihood # 更新最大后验概率和对应的类别 if posterior > max_posterior: max_posterior = posterior max_class = c y_pred[i] = max_class return y_pred # 高斯分布概率密度函数 def gaussian_pdf(self, x, mean, std): exponent = np.exp(-((x-mean)**2 / (2 * std**2))) return (1 / (np.sqrt(2 * np.pi) * std)) * exponent # 将数据集拆分为训练集试集 X = watermelon_data.iloc[:, 1:-1].values y = watermelon_data.iloc[:, -1].values indices = np.random.permutation(len(X)) train_indices, test_indices = indices[:int(0.7*len(X))], indices[int(0.7*len(X)):] X_train, y_train = X[train_indices], y[train_indices] X_test, y_test = X[test_indices], y[test_indices] # 训练朴素贝叶斯分类器 nb = NaiveBayes() nb.fit(X_train, y_train) y_pred = nb.predict(X_test) # 输出预结果 print(y_pred) ``` 需要注意的是,这里使用了高斯分布来估计每个特征在每个类别下的条件概率。如果数据集中存在离散特征,可以使用多项式分布或伯努利分布来进行估计。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

WiIsonEdwards

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值