摘要:k近邻算法主要通过计算预测样本x和训练集中各个样本的距离,并按距离从近到远取k个训练样本的y值,如果y值为1的数量多,则x的预测值为1;否则为0。值得一提的是,这里运用的主要数学知识是计算欧氏距离(欧几里得距离)的数学公式,即 $d(a,b)=\sqrt{\sum_{i}(a_i-b_i)^2}$,不过通过np的矩阵运算,算起来并不复杂。这里自己写代码实现一个简单的k近邻算法,主要是为了理解算法的思想和具体的实现过程。此外,本文还会介绍如何使用scikit-learn封装的KNN,以及KNN的标准封装代码。
一、k近邻算法特点:
1.思想简单
2.涉及数学知识少
3.效果好
二、生成数据集
import numpy as np
import matplotlib.pyplot as plt
# Ten labelled 2-D samples: the first five belong to class 0, the last five to class 1.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.70],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.94, 0.79],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Training samples as numpy arrays (features and labels).
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# The single sample whose label is to be predicted.
x = np.array([8.09, 3.37])
数据可视化:
# Draw each class in its own colour (class 0 green, class 1 red),
# then the sample to be predicted in blue.
for label, colour in ((0, 'g'), (1, 'r')):
    members = X_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
plt.scatter(x[0], x[1], color='b')
plt.show()
三、KNN算法的封装
import numpy as np
from math import sqrt
from collections import Counter
def KNN_classify(k, X_train, y_train, x):
    """Predict the label of one sample by majority vote of its k nearest neighbours.

    Args:
        k: Number of neighbours that vote; must satisfy 1 <= k <= number of
            training samples.
        X_train: 2-D numpy array of training features, one row per sample.
        y_train: 1-D numpy array of training labels, one entry per row of
            ``X_train``.
        x: 1-D numpy array holding the features of the sample to classify.

    Returns:
        The label that occurs most often among the k training samples closest
        to ``x`` (Euclidean distance). On a tie, the label encountered first
        among the nearest neighbours wins.

    Raises:
        AssertionError: If ``k`` is out of range or the array shapes disagree.
    """
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], "the size of X_train must equal to the size of y_train"
    assert X_train.shape[1] == x.shape[0], "the feature number of x must equal to X_train"

    # Euclidean distance: square each coordinate difference BEFORE summing.
    # (Squaring the sum instead lets positive and negative differences cancel
    # and yields |sum(diff)|, which is not a distance.)
    distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))

    # Indices of the training samples ordered from nearest to farthest.
    nearest = np.argsort(distances)
    votes = Counter(y_train[nearest[:k]])
    return votes.most_common(1)[0][0]
四、调用KNN算法
# Classify the sample x using its 6 nearest training samples.
y_predict = KNN_classify(k=6, X_train=X_train, y_train=y_train, x=x)
结果y_predict=1
五、使用scikit-learn中的KNN
from sklearn.neighbors import KNeighborsClassifier
# Build scikit-learn's k-NN classifier with 6 neighbours and fit it on the training data.
kNN = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
# Turn the 1-D sample vector into a single-row matrix before predicting.
x = x.reshape(1, -1)
predict_y = kNN.predict(x)
print(predict_y)
结果:[1]
六、KNN的标准封装代码
封装代码:
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """Minimal k-nearest-neighbours classifier with a fit/predict interface.

    Labels are predicted by majority vote among the ``k`` training samples
    closest (in Euclidean distance) to each query sample.
    """

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: Number of neighbours that vote; must be at least 1.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Training data is stored by fit(); None means "not fitted yet".
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set used by later predictions.

        Args:
            X_train: 2-D numpy array of training features, one row per sample.
            y_train: 1-D numpy array of labels, one entry per row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the sizes disagree or there are fewer than
                ``k`` training samples.
        """
        assert X_train.shape[0] == y_train.shape[0], "the size of X_train must equal to the size of y_train"
        assert self.k <= X_train.shape[0], "the size of X_train must be at least k"
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D numpy array of samples to classify, one row per sample.

        Returns:
            A numpy array with one predicted label per row of ``X_predict``.

        Raises:
            AssertionError: If called before ``fit`` or if the number of
                features differs from the training data.
        """
        assert self._X_train is not None and self._y_train is not None, "must fit before predict"
        assert X_predict.shape[1] == self._X_train.shape[1], "the feature number of X_predict must equal to X_train"
        return np.array([self._predict(sample) for sample in X_predict])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` (1-D numpy array)."""
        assert x.shape[0] == self._X_train.shape[1], "the feature number of x must equal to X_train"

        # Euclidean distance: square each coordinate difference BEFORE summing.
        # (Squaring the sum instead lets positive and negative differences
        # cancel and yields |sum(diff)|, which is not a distance.)
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))

        # Indices of the training samples ordered from nearest to farthest.
        nearest = np.argsort(distances)
        votes = Counter(self._y_train[nearest[:self.k]])
        return votes.most_common(1)[0][0]

    def __repr__(self):
        """Return a short description such as ``KNN(K=6)``."""
        return "KNN(K=%d)" % self.k

    # The method was originally spelled ``_repr_`` (single underscores), which
    # Python never calls for repr(); keep the old name as an alias for callers.
    _repr_ = __repr__
测试代码:
# Same ten labelled 2-D samples as before: five of class 0, then five of class 1.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.70],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.94, 0.79],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# predict() takes a 2-D array, so the single query sample is wrapped in an outer list.
X_predict = np.array([[8.09, 3.37]])

# The classifier must be created before it is fitted; k = 6 matches the
# earlier examples in this article.
knn = KNNClassifier(6)
knn.fit(X_train, y_train)
y_predict = knn.predict(X_predict)
结果:array([1])