1.基础概念
- kNN算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别,则该样本也属于这个类别,并具有这个类别上样本的特性。该方法在确定分类决策上只依据最邻近的一个或者几个样本的类别来决定待分样本所属的类别。 kNN方法在类别决策时,只与极少量的相邻样本有关。由于kNN方法主要靠周围有限的邻近的样本,而不是靠判别类域的方法来确定所属类别的,因此对于类域的交叉或重叠较多的待分样本集来说,kNN方法较其他方法更为适合。
- KNN算法不仅可以用于分类,还可以用于回归。通过找出一个样本的k个最近邻居,将这些邻居的属性的平均值赋给该样本,就可以得到该样本的属性。更有用的方法是将不同距离的邻居对该样本产生的影响给予不同的权值(weight),如权值与距离成反比。
3.
2.kNN实现过程(jupyter notebook中实现)
- 其算法的描述为:
1)计算测试数据与各个训练数据之间的距离;
2)按照距离的递增关系进行排序;
3)选取距离最小的K个点;
4)确定前K个点所在类别的出现频率;
5)返回前K个点中出现频率最高的类别作为测试数据的预测分类。
- 用自己的测试数据模拟kNN及其实现过程
1)生成的数据
import numpy as np
import matplotlib.pyplot as plt
# Toy training set: ten 2-D sample points, one [feature_0, feature_1] pair per row.
raw_data_X = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679179110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745051997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783481, 3.424088941],
[7.939820817, 0.791637231]
]
# Class label of each row above: the first five samples are class 0, the last five class 1.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# Convert to NumPy arrays so the boolean-mask indexing and element-wise arithmetic used below work.
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
2)数据可视化
(1)样本数据
# Scatter-plot the training samples, one colour per class (0 -> green, 1 -> red).
for label, colour in ((0, 'g'), (1, 'r')):
    members = X_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
plt.show()
(2)预测某一点所属的类别
# Query point whose class is to be predicted.
x = np.array([8.093607318, 3.365731514])

# Redraw the training samples per class, then overlay the query point in blue.
for label, colour in ((0, 'g'), (1, 'r')):
    members = X_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
query_x, query_y = x
plt.scatter(query_x, query_y, color='b')
plt.show()
3)实现过程
(1)计算每个样本点和预测点之间的距离
from math import sqrt

# Euclidean distance from the query point x to every training sample.
# Version 1: explicit loop.
distances = []
for x_train in X_train:
    d = sqrt(np.sum((x_train - x)**2))
    distances.append(d)

# Version 2: the same computation as a list comprehension
# (overwrites the list built above with identical values).
distances = [sqrt(np.sum((x_train - x)**2))
             for x_train in X_train]
(2)从小到大记下距离的下标值
# Indices of the training samples ordered by increasing distance to the query point.
nearest = np.argsort(distances)
(3)设置KNN中的超参数k
# Hyperparameter: how many of the nearest samples take part in the vote.
k = 6
(4)找出对应最近k个的标签y的值的列表
# Labels of the k closest training samples, nearest first.
topK_y = list(y_train[nearest[:k]])
(5)记录每个标签下对应的有几个个数,并用字典表示
from collections import Counter
# Tally how many of the k neighbours carry each label (label -> count).
votes = Counter(topK_y)
(6)统计k中最多的属于类的标签,做出预测
# most_common(1) yields [(label, count)] for the most frequent label; keep only the label.
predict_y = votes.most_common(1)[0][0]
3.自己写的KNN分类算法
1.自己实现的KNN分类算法
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote)."""

    def __init__(self, k):
        """Create a classifier in which the ``k`` nearest samples vote.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set and return ``self``.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like of shape (n_samples,) holding the labels.

        Raises:
            AssertionError: if the two inputs differ in length or there are
                fewer than ``k`` training samples.
        """
        # Accept plain lists as well as ndarrays.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Return an array with the predicted label of every row of ``X_predict``.

        Raises:
            AssertionError: if called before ``fit`` or if the number of
                features differs from the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        X_predict = np.asarray(X_predict)
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"
        return np.array([self._predict(x) for x in X_predict])

    def _predict(self, x):
        """Return the predicted label for the single sample ``x``."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        # One vectorised pass instead of a Python loop over the training rows.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        # A stable sort keeps equidistant samples in training order, so the
        # prediction is deterministic when distances tie.
        nearest = np.argsort(distances, kind="stable")
        top_k_labels = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_k_labels)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return the accuracy of the predictions for ``X_test`` against ``y_test``."""
        y_predict = self.predict(X_test)
        return np.sum(y_predict == np.asarray(y_test)) / len(y_test)

    def __repr__(self):
        return "KNN(k=%d)" % self.k
注:jupyter中可以使用import导入实现,或者通过%run +文件名来导入模块来使用该模块
2.自己实现的对于训练数据和测试数据分离代码
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris dataset and take a first look at it.
iris = datasets.load_iris()
iris.keys()
X, y = iris.data, iris.target
X.shape
y.shape
y

# Shuffle the row indices so the split below is random.
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes

# The first test_size shuffled indices form the test set, the rest the training set.
test_ratio = 0.2  # share of the samples held out for testing
test_size = int(len(X) * test_ratio)
test_indexes, train_indexes = shuffled_indexes[:test_size], shuffled_indexes[test_size:]

# Fancy indexing selects the rows belonging to each subset.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

for part in (X_train, y_train, X_test, y_test):
    print(part.shape)
注:实现了以iris数据集为例手动随机分离测试数据集和训练数据集
3.分类准确度的计算accuracy_score
# Accuracy: number of predictions equal to the true label, divided by the number of test samples.
sum(y_predict == y_test) / len(y_test)
4.超参数
1)寻找最好的K值
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..10 and keep the k with the highest score on the test split.
best_score = 0.0
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    # Strict '>' keeps the smallest k when several values tie.
    if score > best_score:
        best_k = k
        best_score = score

print("best_k =", best_k)
print("best_score =", best_score)
2)是否考虑距离的权重:weights="distance"(考虑距离)或 weights="uniform"(不考虑距离)
from sklearn.neighbors import KNeighborsClassifier

# Search the vote-weighting scheme together with k (1..10), scored on the test split.
best_score = 0.0
best_k = -1
best_method = ""
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the first combination found when scores tie.
        if score > best_score:
            best_k = k
            best_score = score
            best_method = method

print("best_method =", best_method)
print("best_k =", best_k)
print("best_score =", best_score)
3)考虑距离之后对于p值和k的选取,对应的是搜索明可夫斯基距离相应的p
from sklearn.neighbors import KNeighborsClassifier

# With distance-weighted voting, search k (1..10) together with the
# Minkowski exponent p (1..5), scored on the test split.
best_score = 0.0
best_k = -1
best_p = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the first combination found when scores tie.
        if score > best_score:
            best_k = k
            best_p = p
            best_score = score

print("best_k =", best_k)
print("best_p =", best_p)
print("best_score =", best_score)
4)数据的归一化处理
(1)最值归一化(min-max normalization):把所有数据映射到0~1之间
# Min-max scaling: subtract the minimum, then divide by the value range.
x_min, x_max = np.min(x), np.max(x)
(x - x_min) / (x_max - x_min)

# The same scaling applied in place to the first two feature columns of X.
for col in (0, 1):
    col_min, col_max = np.min(X[:, col]), np.max(X[:, col])
    X[:, col] = (X[:, col] - col_min) / (col_max - col_min)
(2)均值方差的归一化
# Standardise the first two feature columns of X2 in place:
# subtract the column mean, then divide by the column standard deviation.
for col in (0, 1):
    feature = X2[:, col]
    X2[:, col] = (feature - np.mean(feature)) / np.std(feature)
4.scikit-learn的KNN分类算法
1.搭建KNN分类器
from sklearn.neighbors import KNeighborsClassifier

# scikit-learn's kNN classifier is KNeighborsClassifier, and the number of
# neighbours is passed as n_neighbors.
my_knn_clf = KNeighborsClassifier(n_neighbors=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)
2.训练数据集和测试数据集的分离
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the seed passed to the splitter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)
注:要设置随机种子,测试样本所占比例
3.分类准确度的计算
from sklearn.metrics import accuracy_score
# Compare the true labels with predictions that were computed beforehand.
accuracy_score(y_test, y_predict)
# Alternatively, call score() on the classifier with the test features and labels directly.
knn_clf.score(X_test, y_test)
4.网格搜索法寻找最优超参数 GridSearchCV
# Two sub-grids: with uniform weights only k is searched; with distance
# weights the exponent p is searched as well.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf, param_grid)
%%time
# Run the search over every parameter combination in param_grid (timed by the cell magic above).
grid_search.fit(X_train, y_train)
grid_search.best_estimator_ # the best kNN classifier found by the search
grid_search.best_score_ # the score achieved by that best classifier
knn_clf1 = grid_search.best_estimator_ # keep the best classifier for later use
%%time
# Same search again with n_jobs=-1 and verbose=2; per the scikit-learn docs these
# request all available CPU cores and more detailed progress output.
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
注:GridSearchCV搜索时可以并行处理,n_jobs设置并行使用的CPU核数(-1表示使用全部核),verbose控制搜索过程中输出信息的详细程度,数值越大输出越详细
5.数据的标准化
from sklearn.preprocessing import StandardScaler
standardScalar = StandardScaler()
# fit() is given only the training data.
standardScalar.fit(X_train)
standardScalar.mean_  # inspect the mean_ attribute after fitting
standardScalar.scale_  # inspect the scale_ attribute after fitting
# The result of this transform() call is not assigned, so the X_train echoed next is still unscaled.
standardScalar.transform(X_train)
X_train[:10,:]
# Rebind X_train to the transformed data and look at the first ten rows again.
X_train = standardScalar.transform(X_train)
X_train[:10,:]
# The test set is transformed with the scaler that was fitted on the training set (no refit).
X_test_standard = standardScalar.transform(X_test)
注:训练集用transform做标准化处理之后,测试集也要用同一个(在训练集上fit得到的)scaler进行transform,不能对测试集重新fit。
6.手写数字识别用knn分类的案例
import numpy as np
from sklearn.datasets import load_digits
# Load the scikit-learn handwritten-digits dataset and split it into features and labels.
digits = load_digits()
X = digits.data
y = digits.target
# NOTE(review): playML is a project-local package not shown in this file. Its
# train_test_split is called with test_ratio/seed, unlike the scikit-learn
# version used earlier (test_size/random_state).
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)
# NOTE(review): presumably the packaged version of the hand-written KNNClassifier — confirm.
from playML.kNN import KNNClassifier
knn_clf = KNNClassifier(k=4)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)
# NOTE(review): repeats the previous call; looks like a leftover duplicate — confirm.
knn_clf.score(X_test, y_test)
- KNN用于处理回归问题
from sklearn.neighbors import KNeighborsRegressor
用这个回归器解决回归问题: KNeighborsRegressor()