KNN分类算法——预测入住位置示例
1. 导包
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
2. 原始数据
# Load the check-in training set from the local CSV and preview its first
# ten rows.
data = pd.read_csv("./data/facebook-v-predicting-check-ins/train.csv")
print(data.head(10))
row_id x y accuracy time place_id
0 0 0.7941 9.0809 54 470702 8523065625
1 1 5.9567 4.7968 13 186555 1757726713
2 2 8.3078 7.0407 74 322648 1137537235
3 3 7.3665 2.5165 65 704587 6567393236
4 4 4.0961 1.1307 31 472130 7440663949
5 5 3.8099 1.9586 75 178065 6289802927
6 6 6.3336 4.3720 13 666829 9931249544
7 7 5.7409 6.7697 85 369002 5662813655
8 8 4.3114 6.9410 3 166384 8471780938
9 9 6.3414 0.0758 65 400060 1253803156
3. 数据预处理
# Keep only the rows inside the window 5.0 < x < 5.25 and 4.0 < y < 4.25.
# .copy() makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning on a derived slice.
data = data.query("x > 5.0 & x < 5.25 & y > 4.0 & y < 4.25").copy()
时间转换：将时间戳字段 time 分解为 weekday 与 hour 两个特征
# Interpret the "time" column as a count of seconds and convert it to
# datetimes.
time_value = pd.to_datetime(data["time"], unit="s")
print(time_value)

# A DatetimeIndex exposes .hour / .weekday directly; because it is an Index
# rather than a Series, its values are written into the frame by position.
time_value = pd.DatetimeIndex(time_value)

# assign() returns a new frame instead of writing into `data` in place, which
# avoids SettingWithCopyWarning when `data` is a filtered slice.
data = data.assign(hour=time_value.hour, weekday=time_value.weekday)
data = data.drop(["time"], axis=1)
print(data.head(10))

# Keep only the places that have more than five check-ins.
plate_count = data.groupby("place_id").count()
tf = plate_count[plate_count.row_id > 5].reset_index()
data = data[data["place_id"].isin(tf.place_id)]

# Target is the place id; features are every remaining column except the
# row identifier.
y = data["place_id"]
X = data.drop(["place_id", "row_id"], axis=1)

# Hold out 25% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(X_train[0:10])
print(y_train[0:10])

# Standardise the features. The scaler is fitted on the training split only
# and then reused, unchanged, on the test split: re-fitting on the test data
# would scale it with different mean/variance than the model was trained on.
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)
print(X_train)
print(X_test)
4. 构建KNN模型
# Build a 5-nearest-neighbour classifier and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
5. 结果预测与评分
# Predict the first ten test samples and print them next to the true labels.
y_predict = knn.predict(X_test[:10])
print("实际结果", y_test[:10])
print("预测结果", y_predict)

# Score the classifier on the whole test split.
knn_accuracy = knn.score(X_test, y_test)
print("准确率", knn_accuracy)
6. 使用网格搜索、交叉验证
# Tune n_neighbors with a 5-fold cross-validated grid search.
# GridSearchCV lives in sklearn.model_selection and must be imported at the
# top of the file; without that import this section fails with NameError.
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": [3, 5, 6, 7, 8, 9, 10]}
gcv = GridSearchCV(knn, param_grid=param_grid, cv=5)
gcv.fit(X_train, y_train)

# Predict the first ten test samples with the fitted search object and print
# them next to the true labels.
y_predict = gcv.predict(X_test[:10])
print("实际结果", y_test[:10])
print("预测结果", y_predict)

# Score on the held-out test split.
print("准确率", gcv.score(X_test, y_test))

# Best cross-validation score and the estimator that achieved it.
print("最好的结果", gcv.best_score_)
print("最好的模型", gcv.best_estimator_)

# Full per-candidate cross-validation results.
print("交叉验证结果")
print(gcv.cv_results_)