分类问题 Python sklearn库 k-近邻算法交叉验证(学习笔记)

from numpy import *
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset and keep only the first two feature columns.
iris = load_iris()
data = iris.data[:, :2]
label = iris.target

# Scatter plot of the raw data, one series per class.
# Each entry: (class value, legend text, marker, colour).
class_styles = (
    (0, '0', 'x', 'r'),
    (1, '1', 'o', 'b'),
    (2, '2', 's', 'g'),
)
for class_value, legend_text, marker_shape, marker_color in class_styles:
    members = where(label == class_value)  # index tuple of the samples in this class
    plt.scatter(
        data[members, 0],
        data[members, 1],
        marker=marker_shape,
        color=marker_color,
        label=legend_text,
        s=25,  # marker size
    )
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()

输出结果:

(图:iris 数据集前两个特征的散点图,横轴为 X1,纵轴为 X2;类别 0 为红色 ×,类别 1 为蓝色圆点,类别 2 为绿色方块。)

# Split the samples into a training set and a test set.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=1,
)

# Cross-validation for the k-nearest-neighbours classifier.
folds = 4  # 4-fold cross-validation
k_choices = [1, 3, 5, 7, 9, 11, 13, 15, 17, 21, 23, 25]  # candidate neighbour counts

# Cut the training data into `folds` consecutive pieces along the sample axis.
# array_split yields the same pieces as vsplit/hsplit when the sample count is
# an exact multiple of `folds`, but does not raise when it is not.
X_folds = array_split(x_train, folds)
Y_folds = array_split(y_train, folds)

# Maps each candidate k to the list of accuracies it scored, one per fold.
accuracy_of_k = {k: [] for k in k_choices}

# Each fold serves once as the validation set; the remaining folds are
# stacked back together to form that round's training set.
for i in range(folds):
    X_train = vstack(X_folds[:i] + X_folds[i + 1:])
    X_val = X_folds[i]
    Y_train = hstack(Y_folds[:i] + Y_folds[i + 1:])
    Y_val = Y_folds[i]

    for k in k_choices:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, Y_train)
        Y_val_pred = knn.predict(X_val)
        accuracy = mean(Y_val_pred == Y_val)  # fraction of correct predictions
        accuracy_of_k[k].append(accuracy)

# Report every per-fold accuracy, grouped by k in ascending order.
for k in sorted(k_choices):
    for accuracy in accuracy_of_k[k]:
        print("k=%d,accuracy=%f" % (k, accuracy))

输出结果:

k=1,accuracy=0.700000
k=1,accuracy=0.766667
k=1,accuracy=0.666667
k=1,accuracy=0.800000
k=3,accuracy=0.600000
k=3,accuracy=0.766667
k=3,accuracy=0.766667
k=3,accuracy=0.733333
k=5,accuracy=0.600000
k=5,accuracy=0.766667
k=5,accuracy=0.733333
k=5,accuracy=0.800000
k=7,accuracy=0.666667
k=7,accuracy=0.766667
k=7,accuracy=0.800000
k=7,accuracy=0.666667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=9,accuracy=0.766667
k=11,accuracy=0.833333
k=11,accuracy=0.800000
k=11,accuracy=0.766667
k=11,accuracy=0.733333
k=13,accuracy=0.800000
k=13,accuracy=0.833333
k=13,accuracy=0.766667
k=13,accuracy=0.733333
k=15,accuracy=0.800000
k=15,accuracy=0.766667
k=15,accuracy=0.766667
k=15,accuracy=0.733333
k=17,accuracy=0.800000
k=17,accuracy=0.833333
k=17,accuracy=0.766667
k=17,accuracy=0.733333
k=21,accuracy=0.800000
k=21,accuracy=0.800000
k=21,accuracy=0.766667
k=21,accuracy=0.700000
k=23,accuracy=0.800000
k=23,accuracy=0.833333
k=23,accuracy=0.733333
k=23,accuracy=0.733333
k=25,accuracy=0.800000
k=25,accuracy=0.833333
k=25,accuracy=0.733333
k=25,accuracy=0.733333

# Visualise the cross-validation accuracies: one column of points per k,
# one point per fold.
for k in k_choices:
    fold_scores = accuracy_of_k[k]
    plt.scatter([k] * len(fold_scores), fold_scores)

# Error-bar plot: mean accuracy per k, with the standard deviation across
# folds as the bar length. The x positions and both statistics come from the
# same sorted key list, so they stay aligned even if k_choices is not ascending.
sorted_ks = sorted(accuracy_of_k)
accuracies_mean = array([mean(accuracy_of_k[k]) for k in sorted_ks])
accuracies_std = array([std(accuracy_of_k[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, accuracies_std)
plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()

输出结果:

(图:交叉验证准确率随 k 变化的散点图与误差棒图,横轴为 k,纵轴为 accuracy;误差棒表示各折准确率的均值与标准差。)

# The cross-validation above selected k = 13 as the best neighbour count.
best_k = 13
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(x_train, y_train)  # refit on the full training set
y_test_pred = knn.predict(x_test)

# Element-wise match between predictions and true test labels. This must use
# `y_test_pred`, the name assigned above; a misspelling raises NameError.
is_correct = y_test_pred == y_test
correctNum = sum(is_correct)    # number of correctly classified test samples
accuracyRate = mean(is_correct)  # fraction of correctly classified test samples
print("正确的个数为:%d,准确率为:%f"%(correctNum,accuracyRate))

输出结果:

正确的个数为:24,准确率为:0.800000

  • 1
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值