可以说kNN是机器学习中非常特殊的、没有模型的算法;为了和其他算法统一,可以认为训练数据集就是模型本身。
1. kNN算法基本实现
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
# Walk-through of kNN on a tiny two-feature data set: plot the samples,
# measure the distance from a new point to every training sample, and let
# the K closest samples vote on the new point's class.

# Feature vectors of the training samples (two features each).
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]
# Class label of each sample, in the same order as raw_data_x.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Training set as NumPy arrays so boolean masks and arithmetic work.
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# The new sample whose class is to be predicted.
new = np.array([8.093607318, 3.365731514])

# Training samples: class 0 drawn in green, class 1 in red.
for label, colour in ((0, 'g'), (1, 'r')):
    mask = y_train == label
    plt.scatter(x_train[mask, 0], x_train[mask, 1], color=colour)
# The new sample in blue.
plt.scatter(new[0], new[1], color='b')
# Call plt.show() here to display the figure; per the original note, the
# plot makes it clear that the new point belongs to class 1.

# --- The kNN procedure ---
# Euclidean distance from the new sample to every training sample.
# np.sum((x - new) ** 2) equals (x[0] - new[0]) ** 2 + (x[1] - new[1]) ** 2.
distances = [sqrt(np.sum((sample - new) ** 2)) for sample in x_train]

# Indices of the training samples ordered from closest to farthest.
nearest = np.argsort(distances)

K = 6
# Labels of the K closest training samples.
topK_y = list(y_train[nearest[:K]])

# Voting: count how often each label occurs among the K neighbours.
votes = Counter(topK_y)
# The most frequent label is the prediction.
predict_y = votes.most_common(1)[0][0]
print(predict_y)
2.函数
很容易把上述的过程整理出来写出一个函数
import numpy as np
from math import sqrt
from collections import Counter
def kNN_classify(k, x_train, y_train, new):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. Any array-like (for example nested lists) is
    accepted for the three data arguments.

    Args:
        k: Number of neighbours that vote; must satisfy
            1 <= k <= number of training samples.
        x_train: Training features, shape (n_samples, n_features).
        y_train: Training labels, shape (n_samples,).
        new: The single sample to classify, shape (n_features,).

    Returns:
        The label that occurs most often among the k nearest training
        samples. When several labels tie, the one met first while walking
        the neighbours from nearest to farthest wins (this is the order
        Counter.most_common uses for equal counts).

    Raises:
        AssertionError: If k is out of range or the array shapes disagree.
    """
    # Coerce once so that list inputs work and .shape is always available.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    new = np.asarray(new)

    # Validate the arguments.
    assert 1 <= k <= x_train.shape[0], "k must be valid "
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must equal to the size of y_train"
    assert x_train.shape[1] == new.shape[0], \
        "the feature number of new must be equal to x_train"

    # Euclidean distance from `new` to every training row, computed in one
    # vectorised expression instead of a Python-level loop.
    distances = np.sqrt(np.sum((x_train - new) ** 2, axis=1))

    # A stable sort keeps equidistant samples in training order, so the
    # neighbour set is deterministic.
    nearest = np.argsort(distances, kind="stable")

    # Vote among the labels of the k closest samples.
    votes = Counter(y_train[nearest[:k]])
    return votes.most_common(1)[0][0]
3.使用sklearn中的kNN算法
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
# The same classification task, solved with scikit-learn's kNN classifier.

# Feature vectors of the training samples (two features each).
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]
# Class label of each sample, in the same order as raw_data_x.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Training set as NumPy arrays.
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# predict() is given a 2-D array here: one row per sample to classify,
# hence the double brackets around the single new sample.
new = np.array([[8.093607318, 3.365731514]])

# Build a 6-neighbour classifier and fit it in one step; fit() returns the
# estimator itself, so the chained call leaves kNN_classifier fitted.
kNN_classifier = KNeighborsClassifier(n_neighbors=6).fit(x_train, y_train)

# Print the predicted label array for the new sample.
print(kNN_classifier.predict(new))
4. 模仿sklearn的接口,用面向对象的方式实现kNN
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """k-nearest-neighbours classifier with an sklearn-like fit/predict API.

    "Training" only stores the data; all the work happens at prediction
    time, where each sample is labelled by majority vote among its k
    nearest training samples (Euclidean distance).
    """

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: Number of neighbours that vote; must be at least 1.

        Raises:
            AssertionError: If k is smaller than 1.
        """
        assert 1 <= k, "k must be valid "
        self.k = k
        # Both stay None until fit() is called; predict() checks for that.
        self._x_train = None
        self._y_train = None

    def __repr__(self):
        """Return a short description such as ``KNNClassifier(k=6)``."""
        return "KNNClassifier(k=%r)" % (self.k,)

    def fit(self, x_train, y_train):
        """Store the training set used by later predictions.

        Args:
            x_train: Training features, array-like of shape
                (n_samples, n_features).
            y_train: Training labels, array-like of shape (n_samples,).

        Returns:
            This classifier, so calls can be chained.

        Raises:
            AssertionError: If the two inputs differ in length or contain
                fewer than k samples.
        """
        # Coerce so list inputs work and .shape is always available.
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        assert x_train.shape[0] == y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"
        assert self.k <= x_train.shape[0], \
            "k must not exceed the size of x_train"
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, new):
        """Predict a label for every row of ``new``.

        Args:
            new: Samples to classify, array-like of shape
                (n_queries, n_features).

        Returns:
            A NumPy array holding one predicted label per row of ``new``.

        Raises:
            AssertionError: If fit() has not been called or the feature
                count differs from the training data.
        """
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict!"
        new = np.asarray(new)
        assert new.shape[1] == self._x_train.shape[1], \
            "the feature number of new must be equal to x_train"
        y_predict = [self._predict(x) for x in new]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of the single sample ``x``.

        Args:
            x: One sample, array-like of shape (n_features,).

        Returns:
            The most common label among the k nearest training samples.
            When labels tie, the one met first while walking the
            neighbours from nearest to farthest wins.

        Raises:
            AssertionError: If the feature count differs from the
                training data.
        """
        x = np.asarray(x)
        assert x.shape[0] == self._x_train.shape[1], \
            "the feature number of x must be equal to x_train"
        # Euclidean distance to every training row in one vectorised step.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        # Stable sort: equidistant samples keep their training order.
        nearest = np.argsort(distances, kind="stable")
        # Vote among the labels of the k closest samples.
        votes = Counter(self._y_train[nearest[:self.k]])
        return votes.most_common(1)[0][0]