from numpy import *
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the iris dataset, keeping only the first two features so the
# samples can be plotted in 2-D.
iris = load_iris()
data, label = iris.data[:, :2], iris.target
# Scatter plot of the raw samples, one marker/colour per class.
for cls, mk, col in ((0, 'x', 'r'), (1, 'o', 'b'), (2, 's', 'g')):
    idx = where(label == cls)  # indices of the samples in this class
    # s=25 sets the marker size.
    plt.scatter(data[idx, 0], data[idx, 1],
                marker=mk, color=col, label=str(cls), s=25)
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()
Output:
# Hold out 20% of the data as the final test set (fixed seed for
# reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=1)

'''Cross-validation for the k-nearest-neighbour classifier.'''
folds = 4  # 4-fold cross-validation
k_choices = [1, 3, 5, 7, 9, 11, 13, 15, 17, 21, 23, 25]  # candidate k values
# Split the training data into `folds` equal parts: vsplit cuts the 2-D
# x_train row-wise, hsplit cuts the 1-D y_train along its only axis.
# (The original also pre-assigned X_folds=[] / Y_folds=[]; those were dead
# assignments, immediately overwritten, and are removed here.)
X_folds = vsplit(x_train, folds)
Y_folds = hsplit(y_train, folds)
accuracy_of_k = {}  # maps k -> list of per-fold validation accuracies
# Initialise one accuracy list per candidate k.
# (Loop-body indentation was lost in the pasted original — as written it
# is a syntax error; the structure below restores the intended nesting.)
for k in k_choices:
    accuracy_of_k[k] = []

# Cross-validation: each fold serves once as the validation set while
# the remaining folds are stacked back together as the training set.
for i in range(folds):
    X_train = vstack(X_folds[:i] + X_folds[i + 1:])  # CV training features
    X_val = X_folds[i]                               # CV validation features
    Y_train = hstack(Y_folds[:i] + Y_folds[i + 1:])  # CV training labels
    Y_val = Y_folds[i]                               # CV validation labels
    # Evaluate every candidate k on this fold.
    for k in k_choices:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, Y_train)
        Y_val_pred = knn.predict(X_val)
        accuracy = mean(Y_val_pred == Y_val)  # fraction of correct predictions
        accuracy_of_k[k].append(accuracy)
# Report every fold's accuracy for each candidate k, in ascending k order.
# (Indentation restored — the pasted original had the loop bodies flush
# left, which is a syntax error.)
for k in sorted(k_choices):
    for accuracy in accuracy_of_k[k]:
        print("k=%d,accuracy=%f" % (k, accuracy))
Output:
k=1,accuracy=0.700000
k=1,accuracy=0.766667
k=1,accuracy=0.666667
k=1,accuracy=0.800000
k=3,accuracy=0.600000
k=3,accuracy=0.766667
k=3,accuracy=0.766667
k=3,accuracy=0.733333
k=5,accuracy=0.600000
k=5,accuracy=0.766667
k=5,accuracy=0.733333
k=5,accuracy=0.800000
k=7,accuracy=0.666667
k=7,accuracy=0.766667
k=7,accuracy=0.800000
k=7,accuracy=0.666667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=11,accuracy=0.833333
k=11,accuracy=0.800000
k=11,accuracy=0.766667
k=11,accuracy=0.733333
k=13,accuracy=0.800000
k=13,accuracy=0.833333
k=13,accuracy=0.766667
k=13,accuracy=0.733333
k=15,accuracy=0.800000
k=15,accuracy=0.766667
k=15,accuracy=0.766667
k=15,accuracy=0.733333
k=17,accuracy=0.800000
k=17,accuracy=0.833333
k=17,accuracy=0.766667
k=17,accuracy=0.733333
k=21,accuracy=0.800000
k=21,accuracy=0.800000
k=21,accuracy=0.766667
k=21,accuracy=0.700000
k=23,accuracy=0.800000
k=23,accuracy=0.833333
k=23,accuracy=0.733333
k=23,accuracy=0.733333
k=25,accuracy=0.800000
k=25,accuracy=0.833333
k=25,accuracy=0.733333
k=25,accuracy=0.733333
# Visualise the cross-validation accuracies: one point per fold at each k.
# (Indentation of the scatter call restored — lost in the pasted original.)
for k in k_choices:
    plt.scatter([k] * len(accuracy_of_k[k]), accuracy_of_k[k])
# Error-bar plot of mean +/- std accuracy across folds for each k.
# sorted(accuracy_of_k.items()) iterates k in the same ascending order
# as k_choices, so the arrays line up with the x positions.
accuracies_mean = array([mean(v) for k, v in sorted(accuracy_of_k.items())])
accuracies_std = array([std(v) for k, v in sorted(accuracy_of_k.items())])
plt.errorbar(k_choices, accuracies_mean, accuracies_std)
plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()
Output:
'''Cross-validation above selects k = 13 as the best value.'''
best_k = 13
# Retrain on the full training set with the chosen k and evaluate on the
# held-out test set.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(x_train, y_train)
y_test_pred = knn.predict(x_test)
# Fixed NameError: the original referenced a misspelled `y_teat_pred`.
correctNum = sum(y_test_pred == y_test)     # number of correct predictions
accuracyRate = mean(y_test_pred == y_test)  # test-set accuracy
print("正确的个数为:%d,准确率为:%f"%(correctNum,accuracyRate))
Output:
正确的个数为:24,准确率为:0.800000