import numpy as np
import pandas as pd
import random
import os
import scipy.stats
# Switch the working directory so the relative CSV path below resolves.
# NOTE(review): hard-coded absolute Windows path - only valid on a machine
# that has this directory.
os.chdir(r'D:\Python 练习')
# Load the data set into the module-level frame that is split further down.
dataSet = pd.read_csv('iris.csv')
def randSplit(dataSet, rate, seed=2):
    """Randomly split a DataFrame into a training part and a test part.

    Rows are assigned by a seeded shuffle, so the same ``seed`` always
    yields the same split.  The input frame is left untouched and may carry
    any index; rows are selected by position.

    Args:
        dataSet: DataFrame to split.
        rate: Fraction of rows (``0 <= rate <= 1``) placed in the training
            part; the resulting row count is truncated with ``int``.
        seed: Seed for the shuffle.  The default, 2, is the value that was
            previously hard-coded.

    Returns:
        ``(train, test)``: two new DataFrames, each re-indexed from 0.

    Raises:
        ValueError: If ``rate`` is outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be within [0, 1], got {rate!r}")

    n = dataSet.shape[0]
    m = int(n * rate)

    # A private generator keeps the split reproducible without reseeding the
    # process-wide ``random`` state.
    shuffled = list(range(n))
    random.Random(seed).shuffle(shuffled)

    # shuffled[pos] is the rank drawn for the row at position ``pos``.  Ranks
    # 0..m-1 form the training part and the remaining ranks the test part,
    # each in ascending rank order, so invert the mapping to rank -> position.
    order = [0] * n
    for pos, rank in enumerate(shuffled):
        order[rank] = pos

    train = dataSet.iloc[order[:m]].reset_index(drop=True)
    test = dataSet.iloc[order[m:]].reset_index(drop=True)
    return train, test
# Split the loaded data: 80% of the rows for fitting, the rest for evaluation.
train,test=randSplit(dataSet,0.8)
def gnb_classify(train, test):
    """Classify the rows of ``test`` with a Gaussian naive Bayes model.

    Both frames must hold the numeric feature columns first, followed by the
    class-label column.  Per-class feature means and population variances
    are estimated from ``train``; every test row is assigned the class whose
    Gaussian likelihood is highest.  Class priors are not applied, i.e. all
    classes are treated as equally likely.

    Side effects: prints the fitted means, the fitted variances and the
    accuracy, and adds (or overwrites) a ``'predict'`` column on ``test``
    in place.

    Args:
        train: Training DataFrame; the last column is the class label.
        test: DataFrame to classify, with the same feature columns in the
            same positions and the true label in the column after them.

    Returns:
        The same ``test`` object, now carrying the ``'predict'`` column.

    Raises:
        ValueError: If ``train`` contains no labelled rows.
    """
    n_features = train.shape[1] - 1
    target = train.iloc[:, -1]
    labels = target.value_counts().index
    if labels.empty:
        raise ValueError("train must contain at least one labelled row")

    class_means = []
    class_vars = []
    for label in labels:
        features = train.loc[target == label].iloc[:, :-1]
        class_means.append(features.mean())      # per-feature mean
        class_vars.append(features.var(ddof=0))  # per-feature population variance
    means = pd.DataFrame(class_means, index=labels)
    stds = pd.DataFrame(class_vars, index=labels)  # holds variances, not std devs
    print('means', means)
    print('stds', stds)

    mu = means.to_numpy(dtype=float)
    var = stds.to_numpy(dtype=float)
    # A feature that is constant within a class has zero variance, which
    # would turn the density into 0/0.  Floor the variances at a tiny
    # fraction of the largest one so such features stay usable.
    peak = var.max() if var.size else 0.0
    floor = 1e-9 * peak if peak > 0 else 1e-9
    var = np.maximum(var, floor)

    # Select features by position using the training feature count, so a
    # frame that already carries a 'predict' column can be classified again.
    x = test.iloc[:, :n_features].to_numpy(dtype=float)

    # Sum log-densities instead of multiplying densities: the product
    # underflows to 0 for every class once there are many features.
    diff = x[:, None, :] - mu[None, :, :]          # (rows, classes, features)
    log_pdf = -0.5 * (np.log(2 * np.pi * var) + diff ** 2 / var)
    scores = log_pdf.sum(axis=2)                   # (rows, classes)
    result = labels[scores.argmax(axis=1)].tolist()

    test['predict'] = result
    acc = (test['predict'] == test.iloc[:, n_features]).mean()
    print(f'模型预测准确率为{acc}')
    return test
# Fit on the training part, classify the held-out part and print the accuracy.
gnb_classify(train,test)