import numpy as np
from sklearn import datasets
import matplotlib. pyplot as plt
# Generate a noisy two-class "moons" data set. The seed is fixed (the same
# value the train/test split in this file uses) so that the data, and every
# score computed from it, is reproducible from run to run.
X, y = datasets.make_moons(n_samples=100, noise=0.3, random_state=666)

# Plot the two classes in different colours: class 0 in red, class 1 in blue.
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='r')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b')
plt.show()
数据集分割
from sklearn. model_selection import train_test_split
# Hold out 20% of the samples as a test set; the split is shuffled and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
    shuffle=True,
)

# Draw four point clouds in this order: train/class 0, train/class 1,
# test/class 0, test/class 1. No colour is passed, so matplotlib picks them.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    for cls in (0, 1):
        members = labels == cls
        plt.scatter(features[members, 0], features[members, 1])
plt.show()
交叉验证
from sklearn. model_selection import cross_val_score
from sklearn. neighbors import KNeighborsClassifier
# Score a 5-nearest-neighbours classifier with 3-fold cross-validation on the
# complete data set (X, y). The bare expression is displayed as the cell
# result when run in a notebook.
knn = KNeighborsClassifier(n_neighbors=5)
cross_val_score(knn, X, y, cv=3)
array([0.88235294, 0.85294118, 0.875 ])
使用交叉验证获得最优参数
%%time
# Manual hyper-parameter search: try every (k, p) pair for a distance-weighted
# KNN classifier and keep the pair with the highest mean cross-validation
# score on the training set.
best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        # No cv argument is given, so scikit-learn's default fold count is used.
        scores = cross_val_score(knn_clf, X_train, y_train)
        score = np.mean(scores)
        # Strict '>' keeps the first pair encountered when scores tie.
        if score > best_score:
            best_k, best_p, best_score = k, p, score

# Report the winner once, after the whole grid has been explored.
print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)
Best K = 8
Best P = 3
Best Score = 0.8874643874643874
Wall time: 236 ms
Grid超参数搜索
from sklearn. model_selection import GridSearchCV
from sklearn. neighbors import KNeighborsClassifier
# Two sub-grids are searched:
#   * uniform weighting  - only the neighbour count varies;
#   * distance weighting - the neighbour count and the exponent p both vary.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

# Exhaustive search over param_grid, with every other option left at its default.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
)
%%time
# Run the cross-validated search on the training split only.
grid_search.fit(X_train, y_train)
Wall time: 375 ms
GridSearchCV(cv=None, error_score='raise',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'),
fit_params=None, iid=True, n_jobs=1,
param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
# The estimator configured with the best parameter combination found.
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=8, p=3,
weights='distance')
# Mean cross-validated score achieved by the best parameter combination.
grid_search.best_score_
0.8875
%%time
# Same search as before, but parallelised and with progress logging.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1,  # -1 asks scikit-learn to use all available processors
    verbose=4,  # print progress messages while fitting
)
grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 3.9s
Wall time: 4.41 s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 4.1s finished
# The winning parameter combination, as a dict.
grid_search.best_params_
{'n_neighbors': 8, 'p': 3, 'weights': 'distance'}
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole of (X_train, y_train) with the winning
# parameters. Calling fit() on it again would only repeat that work.
knn_clf = grid_search.best_estimator_

# Accuracy of the tuned classifier on the held-out test split.
knn_clf.score(X_test, y_test)
0.9
在网格搜索中增加交叉验证
# Same search configuration, but with the number of cross-validation folds
# set explicitly to 4. The object is only constructed here (not assigned or
# fitted); in a notebook its repr is displayed as the cell result.
GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    n_jobs=-1,
    verbose=4,
    cv=4,
)
GridSearchCV(cv=4, error_score='raise',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'),
fit_params=None, iid=True, n_jobs=-1,
param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=4)