KNN算法
K近邻算法
一、KNN基础
# --- Imports ---
import numpy as np
import matplotlib.pyplot as plt

# Training features (2-D points) and their class labels.
raw_data_x = [
    [3.54, 2.54],
    [4.52, 7.36],
    [1.34, 3.36],
    [2.28, 2.86],
    [7.42, 4.69],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1]

# X_train holds the feature vectors, y_train the target labels.
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# Scatter plot of the two classes (class 0 green, class 1 red).
plt.figure(figsize=(10, 8))
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
plt.show()

# The query point x, plotted in blue on top of the training data.
x = np.array([8.09, 3.36])
plt.figure(figsize=(10, 8))
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
plt.scatter(x[0], x[1], color='b')
plt.show()
二、KNN 代码实现的过程
# --- Step-by-step KNN implementation ---
from math import sqrt

# Euclidean distance from the query point x to every training sample.
distance = []
for sample in X_train:
    d = sqrt(np.sum((sample - x) ** 2))
    distance.append(d)

# The same computation written as a list comprehension.
distances = [sqrt(np.sum((sample - x) ** 2)) for sample in X_train]
distance

# np.argsort sorts ascending (the default) and returns the sort ORDER
# as indices into the original array, not the sorted values themselves.
np.argsort(distances)
nearest = np.argsort(distances)

k = 6
nearest[:6]
# Training sample 7 is the closest to x (distance ~= 0.30594);
# sample 6 is the second closest.

# Labels of the k nearest training samples.
top_y = [y_train[i] for i in nearest[:k]]
top_y

# Majority vote over the neighbour labels.
from collections import Counter
Counter(top_y)
votes = Counter(top_y)
votes.most_common(1)
votes.most_common(1)[0][0]
import numpy as np
from math import sqrt
from collections import Counter
def KNN_classifier(k, X_train, y_train, x):
    """Classify a single sample x by majority vote of its k nearest
    neighbours (Euclidean distance) in (X_train, y_train).

    Parameters
    ----------
    k : int, number of neighbours, 1 <= k <= len(X_train)
    X_train : ndarray of shape (n_samples, n_features)
    y_train : ndarray of shape (n_samples,)
    x : ndarray of shape (n_features,), the sample to classify

    Returns
    -------
    The most common label among the k nearest neighbours.
    """
    # Fixed: these assertions were dead (wrapped in a docstring) and the
    # third one had a '.' where a ',' belongs; they are now active.
    assert 1 <= k <= X_train.shape[0], "K must be valid"
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must equal to the size of y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to X_train "
    # Euclidean distance from x to every training sample.
    distance = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    # Indices that would sort the distances ascending.
    nearest = np.argsort(distance)
    # Labels of the k nearest samples.
    top_y = [y_train[i] for i in nearest[:k]]
    # Count label occurrences; most_common(1) -> [(label, count)],
    # so [0][0] extracts the winning label.
    votes = Counter(top_y)
    return votes.most_common(1)[0][0]
# Predict the label of x with the hand-rolled classifier.
predict_y = KNN_classifier(6, X_train, y_train, x)
predict_y

# --- Summary ---
# KNN is quite special: it can be regarded as an algorithm without a model.
# To stay uniform with other algorithms, the training set itself can be
# considered the model.

# --- KNN with scikit-learn ---
from sklearn.neighbors import KNeighborsClassifier

KNN_classifier = KNeighborsClassifier(n_neighbors=6)
KNN_classifier.fit(X_train, y_train)
# reshape(1, -1): one sample per row; -1 lets numpy infer the column count.
X_predict = x.reshape(1, -1)
X_predict
y_predict = KNN_classifier.predict(X_predict)
y_predict[0]
四、KNN算法的封装
import numpy as np  # fixed typo: original read "import numpa as np"
from math import sqrt
from collections import Counter
from .metrics import accuracy_score


class KNNClassifier:
    """k-nearest-neighbours classifier with a scikit-learn-like API."""

    def __init__(self, k):
        """Initialise the kNN classifier with the neighbour count k."""
        assert k >= 1, "k must be valid"
        self.k = k
        # Training data is memorised by fit(); None until then.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the classifier: kNN simply memorises the training set."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Return the predicted label for every row of X_predict."""
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample x by majority vote."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        # Euclidean distance from x to every memorised training sample.
        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        # Indices sorting the distances ascending; take the k closest.
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Accuracy of the current model on the test set (X_test, y_test)."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k
# Fixed: the class is named KNNClassifier (capital C); the original called
# KNNclassifier, which would raise NameError.
knn_clf = KNNClassifier(k=6)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_predict)
y_predict[0]
1、对索引进行随机排列
# Randomly permute the indices 0..len(X)-1 to shuffle the data.
shuffle_indexes = np.random.permutation(len(X))
shuffle_indexes
# Fraction of the samples reserved for testing.
test_ratio = 0.2
test_size = int(len(X) * test_ratio)
# Fixed: the two comments below were swapped in the original.
test_indexes = shuffle_indexes[:test_size]   # indices of the TEST set
train_indexes = shuffle_indexes[test_size:]  # indices of the TRAIN set
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]
- 封装
train_test_split
算法
import numpy as np
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into random train and test subsets.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    test_ratio : float in [0, 1], fraction of samples used for the test set
    seed : int or None, random seed for reproducible shuffling

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    # Fixed: `if seed:` silently ignored seed=0; compare against None.
    if seed is not None:
        np.random.seed(seed)
    shuffle_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(y_train.shape)
print("--------------------")
print(X_test.shape)
print(y_test.shape)

# Fixed: the class is KNNClassifier, not KNNclassifier (NameError).
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)
y_predict
# Accuracy: fraction of predictions that match the ground truth
# (vectorized form of the original element-by-element comparison).
acc = np.sum(y_predict == y_test) / y_test.shape[0]
acc
train_test_split
# sklearn's own train_test_split (parameter is test_size here, and
# random_state fixes the shuffle for reproducibility).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=666)
print(X_train.shape)
print(y_train.shape)
print("--------------------")
print(X_test.shape)
print(y_test.shape)
分类准确率
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets

digits = datasets.load_digits()
digits.keys()
# Fixed: X and y were used below without ever being defined.
X = digits.data
y = digits.target

# Inspect one sample: an 8x8 grey-scale digit image.
some_digit = X[666]
y[666]
some_digit_image = some_digit.reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Fixed: KNNClassifier capitalisation (was KNNclassifier).
my_knn_clf1 = KNNClassifier(k=3)
my_knn_clf1.fit(X_train, y_train)
y_predict = my_knn_clf1.predict(X_test)
sklearn中的导入
# --- The full sklearn workflow: split, fit, predict, score ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_test)

# Two equivalent ways to compute the accuracy:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)
knn_clf.score(X_test, y_test)
- train_test_split
- X_train 训练集的特征值
- X_test 测试集的特征值
- y_train 训练集的目标值
- y_test 测试集的目标值
五、超参数
5.1 超参数和模型参数
- 超参数:在算法运行前需要决定的参数
- 模型参数:算法过程中需要学习的参数
- KNN算法没有模型参数
- KNN算法中的K是典型的超参数
5.2 如何去寻找好的超参数?
- 领域知识
- 经验数值
- 实验搜索
5.3 开始调参
# Hyperparameter search for k: try k = 1..10 and keep the best test score.
best_score = 0.0
best_k = -1
for candidate in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=candidate)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    # Keep the first k that achieves the highest accuracy so far.
    if score > best_score:
        best_k, best_score = candidate, score
print("best_k = ", best_k)
print("best_score = ", best_score)
- 设置不同的k值,查看准确率
5.4 KNN算法是否考虑距离?
# Search both the weighting scheme and k.
best_method = ""
best_score = 0.0
best_k = -1
for method in ["uniform", "distance"]:
    for candidate in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=candidate, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k, best_score, best_method = candidate, score, method
print("best_method=", best_method)
print("best_k = ", best_k)
print("best_score = ", best_score)
- weights = “uniform” 不考虑距离
- weights = “distance” 考虑距离
六、距离
- $p=1$ 时,为曼哈顿距离
- $p=2$ 时,为欧拉距离
6.1 搜索明可夫斯基对应的P
# %%time  -- Jupyter cell magic; commented out because it is not valid
# Python syntax outside a notebook cell.
best_p = -1
best_score = 0.0
best_k = -1
# Jointly search k and the Minkowski exponent p (weights fixed to "distance").
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p
print("best_p = ", best_p)
print("best_k = ", best_k)
print("best_score = ", best_score)
- 更多距离定义
七、网格搜索
- 导入数据、导入库、训练模型
# --- Grid search: load the digits data and fit a baseline model ---
import numpy as np
from sklearn import datasets

digits = datasets.load_digits()
X = digits.data
y = digits.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)

from sklearn.neighbors import KNeighborsClassifier
sk_knn_clf = KNeighborsClassifier(n_neighbors=4, weights="uniform")
sk_knn_clf.fit(X_train, y_train)
sk_knn_clf.score(X_test, y_test)
- 添加参数
# Two parameter subgrids: with "uniform" weights p is irrelevant, so only
# the "distance" grid also searches the Minkowski exponent p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]
- 引入Knn算法,导入网格搜索GridSearchCV
# Exhaustive cross-validated search over param_grid.
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(knn_clf, param_grid)
# %%time  -- notebook cell magic; invalid in plain Python, commented out.
grid_search.fit(X_train, y_train)

# %%time
# n_jobs: how many CPU cores to use; -1 means all available cores.
# verbose=2: print progress information while fitting.
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
八、数据归一化
8.1 归一化
- 将所有数据映射到0-1之间
normalization
- 适用于有明显的边界情况下
- 受outlier影响较大
# 1-D min-max normalization example: map values into [0, 1].
x = np.random.randint(0, 100, size=100)
x
(x - np.min(x)) / (np.max(x) - np.min(x))

# 2-D case: a random 50x2 integer matrix, normalized column by column.
x = np.random.randint(0, 100, (50, 2))
x
# Cast to float so the scaled values can be stored back in place.
x = np.array(x, dtype=float)
x[:, 0] = (x[:, 0] - np.min(x[:, 0])) / (np.max(x[:, 0]) - np.min(x[:, 0]))
x[:, 0]
x[:, 1] = (x[:, 1] - np.min(x[:, 1])) / (np.max(x[:, 1]) - np.min(x[:, 1]))
x[:, 1]
plt.scatter(x[:, 0], x[:, 1])
plt.show()
8.2 均值方差归一化standardization
- 数据分布没有明显的边界
- 有可能存在极端数据值
- 将所有的数据归一到均值为0,方差为1的分布
# Mean-variance standardization: zero mean, unit variance, per column.
x2 = np.random.randint(0, 100, (50, 2))
x2 = np.array(x2, dtype=float)
for col in range(2):
    x2[:, col] = (x2[:, col] - np.mean(x2[:, col])) / np.std(x2[:, col])
plt.scatter(x2[:, 0], x2[:, 1])
plt.show()
- 归一化后,各特征的均值接近为0,方差接近为1
Q:如何对训练集和测试集进行归一化?
- 用训练集的均值和方差去归一化测试集:即 mean_train 和 std_train 都取自训练集
- why?
  - 测试数据是模拟真实环境
  - 真实环境很有可能无法得到所有测试数据的均值和方差
  - 对数据进行归一化也是算法的一部分
sklearn中的StandardScaler
# --- Scaling with sklearn's StandardScaler ---
from sklearn.preprocessing import StandardScaler

standardscaler = StandardScaler()
# Learn mean and std from the TRAINING set only.
standardscaler.fit(X_train)
X_train = standardscaler.transform(X_train)
X_test_standard = standardscaler.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn_clf3 = KNeighborsClassifier(n_neighbors=3)
knn_clf3.fit(X_train, y_train)
# Scoring on the SCALED test set gives the high accuracy noted below.
knn_clf3.score(X_test_standard, y_test)
# Scoring on the raw test set (normalization forgotten) is inaccurate.
knn_clf3.score(X_test, y_test)
- 忘记归一化导致预测结果不准确
九、KNN总结
9.1 优点
- 解决分类问题
- 天然可以解决多分类问题
- 思想简单,效果强大
- 可以使用K近邻算法解决回归问题
- 离他最近的K个点的平均值
9.2 缺点
- 效率低下
- 如果训练集有m个样本,n个特征,则预测每一个新的数据,需要O(m*n)
- 优化,使用树结构:KD-tree
- 高度数据相关
- 预测结果不具有可解释性
- 维数灾难
- 解决方法:降维(PCA)