Machine Learning：KNN

chairon

已于 2022-09-08 14:41:49 修改

阅读量244

点赞数

分类专栏：笔记、机器学习　文章标签：python、numpy

于 2020-06-30 13:57:38 首次发布

本文链接：https://blog.csdn.net/chairon/article/details/107041016

版权

笔记同时被 2 个专栏收录

42 篇文章 1 订阅

订阅专栏

机器学习

6 篇文章 0 订阅

订阅专栏

KNN基础

import numpy as np
import matplotlib.pyplot as plt
# Toy 2-D training set: ten samples, one [feature_0, feature_1] pair per row.
raw_data_x = [
    [3.3, 2.3], [3.1, 1.7], [1.3, 3.6], [3.5, 4.6], [2.2, 2.8],  # label 0
    [7.4, 4.6], [5.7, 3.5], [9.1, 2.5], [7.7, 3.4], [7.9, 0.7],  # label 1
]
# Labels aligned row-by-row with raw_data_x: five zeros followed by five ones.
raw_data_y = [0] * 5 + [1] * 5

# NumPy array versions of the raw Python lists.
x_train = np.asarray(raw_data_x)
y_train = np.asarray(raw_data_y)

x_train

array([[3.3, 2.3],
       [3.1, 1.7],
       [1.3, 3.6],
       [3.5, 4.6],
       [2.2, 2.8],
       [7.4, 4.6],
       [5.7, 3.5],
       [9.1, 2.5],
       [7.7, 3.4],
       [7.9, 0.7]])

y_train

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# Draw the training samples, one colour per class: green for label 0, red for label 1.
# Selecting the rows of a class and transposing yields (feature_0 values, feature_1 values).
plt.scatter(*x_train[y_train == 0].T, color='g')
plt.scatter(*x_train[y_train == 1].T, color='r')

<matplotlib.collections.PathCollection at 0x257688940c8>

在这里插入图片描述

# The new sample to classify.
x = np.array([8.0, 3.3])

# Redraw both training classes, then overlay the new sample in blue.
plt.scatter(*x_train[y_train == 0].T, color='g')
plt.scatter(*x_train[y_train == 1].T, color='r')
plt.scatter(*x, color='b')

<matplotlib.collections.PathCollection at 0x12ae1649d88>

在这里插入图片描述

knn过程

from math import sqrt

# Euclidean distance from the query point x to every training sample.
distances = []
# The loop variable must not be called x_train: rebinding that name would
# leave x_train pointing at its last row once the loop finishes, destroying
# the training matrix for every later cell.
for sample in x_train:
    d = sqrt(np.sum((sample - x) ** 2))
    distances.append(d)
distances

[4.805205510693586,
 5.154609587543949,
 6.706713054842886,
 4.684015371452148,
 5.821511831131154,
 1.431782106327635,
 2.308679276123039,
 1.360147050873544,
 0.31622776601683783,
 2.601922366251537]

  distances= [sqrt( np.sum((x_train- x)** 2)) for x_train in x_train ]
distances

[4.805205510693586,
 5.154609587543949,
 6.706713054842886,
 4.684015371452148,
 5.821511831131154,
 1.431782106327635,
 2.308679276123039,
 1.360147050873544,
 0.31622776601683783,
 2.601922366251537]

np.argsort(distances)

array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)

# Indices of the training samples, ordered from smallest to largest distance.
nearest = np.argsort(distances)
k = 6
# Labels of the k closest samples: index y_train once with the first k
# positions, then turn the resulting array into a plain list.
topK_y = list(y_train[nearest[:k]])
topK_y

[1, 1, 1, 1, 1, 0]

from collections import Counter
Counter(topK_y)

Counter({1: 5, 0: 1})

# Tally the votes: maps each label found among the k nearest neighbours to its count.
votes=Counter(topK_y)
# most_common(1) gives a one-element list holding the (label, count) pair with the highest count.
votes.most_common(1)

[(1, 5)]

votes.most_common(n) 返回票数最多的前 n 个元素，结果是一个由 (元素, 票数) 元组组成的列表（不是二维数组），例如 [(1, 5)]；因此用 [0][0] 即可取出得票最多的类别

votes.most_common(1)[0][0]

# [0] selects the top (label, count) pair, the second [0] keeps only the label:
# that label is the kNN prediction for the query point.
predict_y=votes.most_common(1)[0][0]
predict_y

使用scikit-learn中的kNN

from sklearn.neighbors import KNeighborsClassifier
# Create scikit-learn's kNN classifier configured to use 6 neighbours (n_neighbors=6).
kNNClassifier=KNeighborsClassifier(n_neighbors=6)
# Rebuild the toy training set from scratch: ten 2-D samples, five per class.
raw_data_x = [
    [3.3, 2.3],
    [3.1, 1.7],
    [1.3, 3.6],
    [3.5, 4.6],
    [2.2, 2.8],
    [7.4, 4.6],
    [5.7, 3.5],
    [9.1, 2.5],
    [7.7, 3.4],
    [7.9, 0.7],
]
# One label per row above: the first five samples are class 0, the rest class 1.
raw_data_y = [0] * 5 + [1] * 5

# Convert both lists to NumPy arrays.
x_train, y_train = np.asarray(raw_data_x), np.asarray(raw_data_y)

kNNClassifier.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

kNNClassifier.predict(x)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-121-7fe027ee795c> in <module>
----> 1 kNNClassifier.predict(x)


E:\Anaconda\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
    169             Class labels for each data sample.
    170         """
--> 171         X = check_array(X, accept_sparse='csr')
    172 
    173         neigh_dist, neigh_ind = self.kneighbors(X)


E:\Anaconda\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    554                     "Reshape your data either using array.reshape(-1, 1) if "
    555                     "your data has a single feature or array.reshape(1, -1) "
--> 556                     "if it contains a single sample.".format(array))
    557 
    558         # in the future np.flexible dtypes will be handled like object dtypes


ValueError: Expected 2D array, got 1D array instead:
array=[8.  3.3].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# predict() rejected the 1-D array above ("Expected 2D array"), so reshape the
# single sample x into a matrix with exactly one row; -1 lets NumPy infer the
# number of columns.
X_pridict=x.reshape(1, -1)
X_pridict

array([[8. , 3.3]])

kNNClassifier.predict(X_pridict)

array([1])

# predict() returns an array of labels (array([1]) for this one-row input);
# index 0 extracts the label of the only sample.
y_pridict=kNNClassifier.predict(X_pridict)
y_pridict[0]

重新整理代码

# Execute hh.py in the notebook; presumably it defines the KNNClassfier class used below.
%run hh.py
# Same workflow as with scikit-learn: construct with k, fit on the training data, predict.
knn_clf=KNNClassfier(k=6)
knn_clf.fit(x_train,y_train)
y_predict=knn_clf.predict(X_pridict)
# NOTE(review): the recorded result is array([None], dtype=object) instead of a class label —
# presumably a prediction helper in hh.py never returns its result; verify in hh.py.
y_predict

array([None], dtype=object)

测试我们的算法

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris dataset, then pull out its feature matrix and label vector.
iris = datasets.load_iris()
x, y = iris.data, iris.target
x.shape

(150, 4)

y.shape

(150,)

train_test_split

iris 的标签 y 是按类别顺序排列的（如下所示），因此划分前需要先打乱：用 np.random.permutation 对数据集 x 的样本下标进行乱序处理，返回乱序后的索引数组

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Random permutation of the row positions 0 .. n_samples-1 of x.
n_samples = len(x)
shuffle_indexes = np.random.permutation(n_samples)
shuffle_indexes

array([139,  10,   7,  72,   2,  41,  43, 142,  32,  85, 100,  98, 111,
        91,  14,  96, 120,  35,  69, 103,   4,  31,  49,  18, 147,  92,
        87,  16,  97,  54,  23,  37,  77,  73,  81,  63,  15, 132, 115,
        74,  75,  17,   0, 124,  64, 131,  33,   3, 135,  82,  22,  34,
       107,  80,  48, 121,  53, 146,  83, 137,  19,  60, 140, 149,  38,
       143, 136,   5, 113,   6, 127,  42, 123,  56, 138,  61,  88, 117,
       114,  44,  55,  90, 105,  57,  78, 118,  94,  30,  51,  25, 108,
        29, 119,  13,  67,  40,  39, 122, 110, 101, 116,  24, 145,  79,
        84,  70,  45,  21,   8,   1, 141, 130,  86, 109,  52, 144,  59,
        20,  26,  27,  50,  47,  95,   9, 104, 128,  71, 125,  68, 148,
       126, 133,  93,  11,  28, 102, 112, 134,  36,  46,  65,  66,  89,
        58, 106,  76,  62,  99, 129,  12])

测试数据集

# Fraction of the samples held out for testing, and the resulting sample count.
test_ratio = 0.2
test_size = int(len(x) * test_ratio)
test_size

# The first test_size shuffled positions become the test set; the remainder is for training.
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]

# Index features and labels with the same index arrays so each row keeps its label.
x_train, y_train = x[train_indexes], y[train_indexes]
x_test, y_test = x[test_indexes], y[test_indexes]

print(x_train.shape)
print(y_train.shape)

(120, 4)
(120,)

print(x_test.shape)
print(y_test.shape)

(30, 4)
(30,)

# Execute learning/sknn/Sknn.py in the notebook; presumably it defines the KNNClassfier class used below.
%run learning/sknn/Sknn.py
# Fit the classifier with k=3 on the training split and predict a label for every test sample.
my_knn_clf=KNNClassfier(k=3)
my_knn_clf.fit(x_train,y_train)
y_predict =my_knn_clf.predict(x_test)
# NOTE(review): the recorded predictions are all None (and the accuracy computed afterwards is 0.0) —
# presumably a prediction helper in Sknn.py never returns its result; verify in Sknn.py.
y_predict

array([None, None, None, None, None, None, None, None, None, None, None,
       None, None, None, None, None, None, None, None, None, None, None,
       None, None, None, None, None, None, None, None], dtype=object)

y_test

array([2, 0, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 2, 1, 0, 1, 2, 0, 1, 2, 0, 0,
       0, 0, 2, 1, 1, 0, 1, 1])

sum(y_predict==y_test)/len(y_test)

0.0

sklearn 中的train_test_split

from sklearn.model_selection import train_test_split

# Let scikit-learn do the 80/20 split; the integer random_state seeds the
# shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=666
)
# Show the shapes of the training features and labels, one per line.
print(x_train.shape, y_train.shape, sep='\n')