KNN基础
import numpy as np
import matplotlib. pyplot as plt
# Toy data set: ten samples with two features each.
raw_data_x = [
    [3.3, 2.3],
    [3.1, 1.7],
    [1.3, 3.6],
    [3.5, 4.6],
    [2.2, 2.8],
    [7.4, 4.6],
    [5.7, 3.5],
    [9.1, 2.5],
    [7.7, 3.4],
    [7.9, 0.7],
]
# One label per sample: the first five are class 0, the last five class 1.
raw_data_y = [0] * 5 + [1] * 5

# NumPy views of the same data, used for boolean masking and arithmetic.
x_train, y_train = np.array(raw_data_x), np.array(raw_data_y)
x_train
array([[3.3, 2.3],
[3.1, 1.7],
[1.3, 3.6],
[3.5, 4.6],
[2.2, 2.8],
[7.4, 4.6],
[5.7, 3.5],
[9.1, 2.5],
[7.7, 3.4],
[7.9, 0.7]])
y_train
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
plt. scatter( x_train[ y_train== 0 , 0 ] , x_train[ y_train== 0 , 1 ] , color= 'g' )
plt. scatter( x_train[ y_train== 1 , 0 ] , x_train[ y_train== 1 , 1 ] , color= 'r' )
<matplotlib.collections.PathCollection at 0x257688940c8>
x= np. array( [ 8.0 , 3.3 ] )
plt. scatter( x_train[ y_train== 0 , 0 ] , x_train[ y_train== 0 , 1 ] , color= 'g' )
plt. scatter( x_train[ y_train== 1 , 0 ] , x_train[ y_train== 1 , 1 ] , color= 'r' )
plt. scatter( x[ 0 ] , x[ 1 ] , color= 'b' )
<matplotlib.collections.PathCollection at 0x12ae1649d88>
knn过程
from math import sqrt
# Euclidean distance from the query point x to every training sample.
# The loop variable is deliberately NOT called x_train: a for-loop target
# stays bound after the loop ends, so reusing that name would replace the
# whole training matrix with its last row and break every later cell.
distances = []
for sample in x_train:
    d = sqrt(np.sum((sample - x) ** 2))
    distances.append(d)
distances
[4.805205510693586,
5.154609587543949,
6.706713054842886,
4.684015371452148,
5.821511831131154,
1.431782106327635,
2.308679276123039,
1.360147050873544,
0.31622776601683783,
2.601922366251537]
# Same Euclidean distances as the explicit loop, built in one comprehension.
# The loop variable has its own name so it cannot be confused with x_train.
distances = [sqrt(((sample - x) ** 2).sum()) for sample in x_train]
distances
[4.805205510693586,
5.154609587543949,
6.706713054842886,
4.684015371452148,
5.821511831131154,
1.431782106327635,
2.308679276123039,
1.360147050873544,
0.31622776601683783,
2.601922366251537]
np. argsort( distances)
array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)
# Indices of the training samples ordered from closest to farthest.
nearest = np.argsort(distances)
# Number of neighbours that take part in the vote.
k = 6
# Labels of the k closest samples, selected with a single index-array lookup.
topK_y = list(y_train[nearest[:k]])
topK_y
[1, 1, 1, 1, 1, 0]
from collections import Counter
Counter( topK_y)
Counter({1: 5, 0: 1})
votes= Counter( topK_y)
votes. most_common( 1 )
[(1, 5)]
votes.most_common(n) 返回票数最多的 n 个元素,结果是一个由 (元素, 票数) 元组组成的列表(不是二维数组),所以用 [0][0] 取出得票最多的标签
votes. most_common( 1 ) [ 0 ] [ 0 ]
1
predict_y= votes. most_common( 1 ) [ 0 ] [ 0 ]
predict_y
1
使用scikit-learn中的kNN
from sklearn. neighbors import KNeighborsClassifier
# Rebuild the toy training set (ten 2-D samples with binary labels).
raw_data_x = [
    [3.3, 2.3],
    [3.1, 1.7],
    [1.3, 3.6],
    [3.5, 4.6],
    [2.2, 2.8],
    [7.4, 4.6],
    [5.7, 3.5],
    [9.1, 2.5],
    [7.7, 3.4],
    [7.9, 0.7],
]
raw_data_y = [0] * 5 + [1] * 5
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# scikit-learn classifier that votes among the 6 nearest neighbours,
# fitted on the toy data. fit() is the last expression so the notebook
# still echoes the fitted estimator.
kNNClassifier = KNeighborsClassifier(n_neighbors=6)
kNNClassifier.fit(x_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=6, p=2,
weights='uniform')
kNNClassifier. predict( x)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-121-7fe027ee795c> in <module>
----> 1 kNNClassifier.predict(x)
E:\Anaconda\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
169 Class labels for each data sample.
170 """
--> 171 X = check_array(X, accept_sparse='csr')
172
173 neigh_dist, neigh_ind = self.kneighbors(X)
E:\Anaconda\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
554 "Reshape your data either using array.reshape(-1, 1) if "
555 "your data has a single feature or array.reshape(1, -1) "
--> 556 "if it contains a single sample.".format(array))
557
558 # in the future np.flexible dtypes will be handled like object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[8. 3.3].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
X_pridict= x. reshape( 1 , - 1 )
X_pridict
array([[8. , 3.3]])
kNNClassifier. predict( X_pridict)
array([1])
y_pridict= kNNClassifier. predict( X_pridict)
y_pridict[ 0 ]
1
重新整理代码
% run hh. py
knn_clf= KNNClassfier( k= 6 )
knn_clf. fit( x_train, y_train)
y_predict= knn_clf. predict( X_pridict)
y_predict
array([None], dtype=object)
测试我们的算法
import numpy as np
import matplotlib. pyplot as plt
from sklearn import datasets
iris= datasets. load_iris( )
x= iris. data
y= iris. target
x. shape
(150, 4)
y. shape
(150,)
train_test_split
生成 0 到 len(x)-1 的一个随机排列作为乱序索引(x 和 y 本身不会被打乱,之后用这些索引同时取 x 和 y,保证样本与标签仍然一一对应)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
shuffle_indexes= np. random. permutation( len ( x) )
shuffle_indexes
array([139, 10, 7, 72, 2, 41, 43, 142, 32, 85, 100, 98, 111,
91, 14, 96, 120, 35, 69, 103, 4, 31, 49, 18, 147, 92,
87, 16, 97, 54, 23, 37, 77, 73, 81, 63, 15, 132, 115,
74, 75, 17, 0, 124, 64, 131, 33, 3, 135, 82, 22, 34,
107, 80, 48, 121, 53, 146, 83, 137, 19, 60, 140, 149, 38,
143, 136, 5, 113, 6, 127, 42, 123, 56, 138, 61, 88, 117,
114, 44, 55, 90, 105, 57, 78, 118, 94, 30, 51, 25, 108,
29, 119, 13, 67, 40, 39, 122, 110, 101, 116, 24, 145, 79,
84, 70, 45, 21, 8, 1, 141, 130, 86, 109, 52, 144, 59,
20, 26, 27, 50, 47, 95, 9, 104, 128, 71, 125, 68, 148,
126, 133, 93, 11, 28, 102, 112, 134, 36, 46, 65, 66, 89,
58, 106, 76, 62, 99, 129, 12])
测试数据集
test_ratio= 0.2
test_size= int ( len ( x) * test_ratio)
test_size
30
# The first test_size shuffled indices form the test set; the rest train.
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]

# Index features and labels with the same index arrays so that each sample
# stays paired with its label.
x_train, y_train = x[train_indexes], y[train_indexes]
x_test, y_test = x[test_indexes], y[test_indexes]
print ( x_train. shape)
print ( y_train. shape)
(120, 4)
(120,)
print ( x_test. shape)
print ( y_test. shape)
(30, 4)
(30,)
% run learning/ sknn/ Sknn. py
my_knn_clf= KNNClassfier( k= 3 )
my_knn_clf. fit( x_train, y_train)
y_predict = my_knn_clf. predict( x_test)
y_predict
array([None, None, None, None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None], dtype=object)
y_test
array([2, 0, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 2, 1, 0, 1, 2, 0, 1, 2, 0, 0,
0, 0, 2, 1, 1, 0, 1, 1])
sum ( y_predict== y_test) / len ( y_test)
0.0
sklearn 中的train_test_split
from sklearn. model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split( x, y, test_size= 0.2 , random_state= 666 )
print ( x_train. shape)
print ( y_train. shape)
(120, 4)
(120,)
print ( x_test. shape)
print ( y_test. shape)
(30, 4)
(30,)
疑问:
自己写的 .py 文件为什么总是导入失败?用 import 无法导入;改用 %run 魔法命令虽然能运行脚本,但 predict 的结果全是 None(见上文 y_predict 的输出,准确率因此为 0.0),伤脑筋。