kNN 算法学习与应用
导入库
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib setup: use the SimHei font so Chinese labels render,
# and keep the minus sign as a proper glyph under a CJK font.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
# IPython magic: render inline figures as crisp SVG.
%config InlineBackend.figure_format = 'svg'
from sklearn.preprocessing import StandardScaler # preprocessing module: feature standardization
from sklearn.linear_model import LinearRegression # linear models: linear regression
from sklearn.model_selection import GridSearchCV # model selection: grid search over hyper-parameters
from sklearn.metrics import accuracy_score # evaluation metric: accuracy
1.kNN算法
k-Nearest Neighbor
找到k个最近距离的邻居
分类:少数服从多数,x1的类型与最多的邻居的类型一样
回归:x1的预测值是k个邻居标签的平均值
欧式距离:
$d = \sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2}$
$d = \sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2 + (z_1 - z_2)^2}$
回归算法:
def knn_R(X, y, X_sample, k):
    """kNN regression: predict X_sample's value as the mean label of its
    k nearest training points (Euclidean distance).

    Parameters:
        X: (n_samples, n_features) array of training points.
        y: length-n sequence of numeric labels.
        X_sample: one query point, shape (n_features,).
        k: number of neighbors to average over.

    Returns:
        The mean label of the k nearest neighbors (a float).
    """
    # Euclidean distance from the query point to every training point.
    d = np.sqrt(np.sum((X - X_sample) ** 2, axis=1))
    # argsort replaces the original Python-level sort of (distance, label)
    # tuples: no tuple building, and ties no longer fall through to
    # comparing label values.
    nearest = np.argsort(d)[:k]
    return np.asarray(y)[nearest].mean()
分类算法:
def knn_C(X, y, X_sample, k):
    """kNN classification: predict X_sample's class by majority vote of its
    k nearest training points (Euclidean distance).

    Parameters:
        X: (n_samples, n_features) array of training points.
        y: length-n sequence of class labels.
        X_sample: one query point, shape (n_features,).
        k: number of neighbors that vote.

    Returns:
        The most common label among the k nearest neighbors; on a tied
        vote, the smallest label wins (pandas mode() returns labels sorted).
    """
    # Euclidean distance from the query point to every training point.
    d = np.sqrt(np.sum((X - X_sample) ** 2, axis=1))
    # argsort replaces the original Python-level sort of (distance, label)
    # tuples — same neighbors, without comparing labels on distance ties.
    nearest = np.argsort(d)[:k]
    # mode()[0] preserves the original tie-break: smallest tied label wins.
    return pd.Series(np.asarray(y)[nearest]).mode()[0]
应用:
制作样本集
from sklearn.datasets import make_blobs # toy-data generator: Gaussian blobs
# Draw a random 2-D sample set with 4 cluster centers; X is (100, 2), y holds labels 0-3.
X,y = make_blobs(centers=4)
其中X:
(array([[ 8.36716588, 2.14903081],
[ 1.49071545, 1.16468017],
[-6.36293048, -3.30167076],
[ 6.18790626, 0.95230456],
[ 1.24846252, -1.97585254],
[ 2.2395423 , 2.91643504],
...
...
其中y:
array([3, 2, 1, 3, 0, 2, 0, 3, 1, 2, 3, 1, 0, 1, 0, 0, 1, 2, 1, 1, 2, 2,
1, 2, 2, 2, 1, 0, 1, 1, 3, 3, 2, 2, 2, 1, 0, 0, 1, 3, 2, 1, 3, 1,
0, 0, 2, 1, 3, 2, 2, 3, 2, 3, 0, 1, 0, 0, 2, 0, 3, 0, 3, 2, 0, 3,
0, 0, 3, 0, 3, 3, 0, 2, 1, 1, 2, 3, 3, 0, 3, 1, 1, 3, 2, 0, 3, 2,
1, 1, 3, 1, 2, 3, 1, 0, 3, 2, 0, 0]))
作图观察数据
方法一:利用 DataFrame
# Wrap the sample matrix in a DataFrame and attach the labels as a column,
# so each class can be selected by boolean filtering for plotting.
data=pd.DataFrame(X,columns=['x1','x2'])
data['label']=y
data.head()
"""
x1 x2 label
0 8.367166 2.149031 3
1 1.490715 1.164680 2
2 -6.362930 -3.301671 1
3 6.187906 0.952305 3
4 1.248463 -1.975853 0
"""
画图:
# Scatter each of the four classes in its own color, keyed by the label column.
for lbl in range(4):
    subset = data[data['label'] == lbl]
    plt.scatter(subset.x1, subset.x2, label=lbl)
plt.legend()
方法二:利用 numpy 作图
# Same scatter plot built straight from the NumPy arrays via boolean masks.
for lbl in range(4):
    mask = y == lbl
    plt.scatter(X[mask, 0], X[mask, 1])
预测
# Classify one new point with the hand-written kNN classifier (k = 3).
new = np.array([-2.5, 2])
knn_C(X, y, new, 3)  # e.g. 2.0 — fixed: was `Knn_C`, a NameError; the function is named knn_C
调用sklearn库中kNN算法解决问题
导入库:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
实例化传入数据
# First, instantiate the estimator (vote among the 5 nearest neighbors).
KNN_C = KNeighborsClassifier(n_neighbors= 5)
# Then fit it on the training data with the fit method.
KNN_C.fit(X,y)
预测:
# Predict the class of two new points at once.
KNN_C.predict(np.array(([-2.5,2],[0,-4]))) # array([2, 3])
2.在鸢尾花数据集上的应用
2.1数据集介绍
导入数据集:
#用数据调包,使用KNN模型
# Load the bundled iris dataset to exercise the KNN model.
from sklearn.datasets import load_iris
iris = load_iris()
观察数据:
# Put the feature matrix into a DataFrame with the original feature names.
data = pd.DataFrame(iris.data,columns=iris.feature_names)
data.head()
添加标签列:
# Attach the species codes (0, 1, 2) as a label column.
iris.target
data['label']=iris.target
3种不同鸢尾花:
# Sepal length vs. petal length, one color per iris species (labels 0-2).
for lbl in range(3):
    rows = data['label'] == lbl
    plt.scatter(data.loc[rows, 'sepal length (cm)'], data.loc[rows, 'petal length (cm)'])
2.2应用
划分数据集:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples as the test set (random split each run).
xtrain,xtest,ytrain,ytest = train_test_split(iris.data,iris.target,test_size = 0.3)
预测(每次预测结果可能不同):
# Fit a 3-neighbor classifier and report its accuracy on the held-out test set.
KNN = KNeighborsClassifier(3).fit(xtrain,ytrain)
KNN.score(xtest,ytest) # 0.9777777777777777
以准确率为标准,找最好的K值
# Fit one classifier per k in [1, 19] and record each test-set accuracy.
scores = [KNeighborsClassifier(k).fit(xtrain, ytrain).score(xtest, ytest)
          for k in np.arange(1, 20)]
画出准确率的图像:
# Plot accuracy against k, with a tick at every candidate k.
plt.plot(np.arange(1,20),scores)
plt.xticks(np.arange(1,20))
选择准确率高的k带入当做最终模型
knn = KNeighborsClassifier(5).fit(xtrain,ytrain)