KNN
字段 | 含义 |
---|---|
income | 收入 |
attractive | 吸引力 |
assets | 资产 |
edueduclass | 教育程度 |
Dated | 是否约会成功 |
income_rank | 收入等级 |
attractive_rank | 吸引力等级 |
assets_rank | 资产等级 |
- 加载数据集
# Load the dating dataset and show summary statistics for a first look.
import pandas as pd
import os
# os.chdir('Q:/data')  # optionally switch to the data directory first
orgData = pd.read_csv('date_data2.csv')
orgData.describe()  # count/mean/std/quartiles per column (output below)
income | attractive | assets | edueduclass | Dated | income_rank | attractive_rank | assets_rank | |
---|---|---|---|---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
mean | 9010.000000 | 50.500000 | 96.006300 | 3.710000 | 0.500000 | 1.550000 | 1.560000 | 1.510000 |
std | 5832.675288 | 28.810948 | 91.082226 | 1.225116 | 0.502519 | 1.140397 | 1.103896 | 1.123621 |
min | 3000.000000 | 1.000000 | 3.728400 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 5000.000000 | 28.000000 | 31.665269 | 3.000000 | 0.000000 | 1.000000 | 1.000000 | 0.750000 |
50% | 7500.000000 | 51.000000 | 70.746924 | 4.000000 | 0.500000 | 2.000000 | 2.000000 | 2.000000 |
75% | 11500.000000 | 68.875000 | 131.481061 | 4.000000 | 1.000000 | 3.000000 | 2.250000 | 2.250000 |
max | 34000.000000 | 99.500000 | 486.311758 | 6.000000 | 1.000000 | 3.000000 | 3.000000 | 3.000000 |
- 选取自变量
# Predictors: the first four columns (income, attractive, assets, edueduclass).
feature_cols = orgData.columns[:4]
X = orgData.loc[:, feature_cols]
# Target: the 'Dated' flag, kept as a one-column DataFrame.
Y = orgData.loc[:, ['Dated']]
- 极值标准化
# Rescale every feature to [0, 1] so no single feature dominates the
# KNN distance computation.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[1:5]  # peek at rows 1-4 of the scaled matrix
array([[0. , 0.13705584, 0.07649535, 0.6 ],
[0. , 0.05076142, 0.00293644, 0. ],
[0. , 0. , 0.00691908, 0. ],
[0.01612903, 0.13705584, 0. , 0.2 ]])
- 划分训练集和测试集
# Hold out 25% of the rows for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split

train_data, test_data, train_target, test_target = train_test_split(
    X_scaled, Y, random_state=123, train_size=0.75, test_size=0.25)
- 建模
from sklearn.neighbors import KNeighborsClassifier

# k=3 neighbours; the default metric is Euclidean distance (minkowski, p=2).
# fit() returns the estimator itself, so the chained call binds the
# fitted model directly.
model = KNeighborsClassifier(n_neighbors=3).fit(
    train_data, train_target.values.flatten())
test_est = model.predict(test_data)
- 验证
# Per-class precision/recall/F1 of the KNN model on the held-out test set.
from sklearn import metrics

print(metrics.classification_report(test_target, test_est))
precision recall f1-score support
0 0.92 0.92 0.92 12
1 0.92 0.92 0.92 13
avg / total 0.92 0.92 0.92 25
# Overall test-set accuracy (matches the 0.92 shown in the report above).
model.score(test_data, test_target)
0.92
- 选择k值
# Simple holdout scan over k = 1..9: refit and score on the same split.
flat_target = train_target.values.flatten()
for k in range(1, 10):
    acc = KNeighborsClassifier(n_neighbors=k).fit(
        train_data, flat_target).score(test_data, test_target)
    print('When k=%s , the score is %.4f' % (k, acc))
When k=1 , the score is 0.9200
When k=2 , the score is 0.8800
When k=3 , the score is 0.9200
When k=4 , the score is 0.9200
When k=5 , the score is 0.8800
When k=6 , the score is 0.8800
When k=7 , the score is 0.9200
When k=8 , the score is 0.8800
When k=9 , the score is 0.9200
- 交叉验证选择k值
from sklearn.model_selection import GridSearchCV

# BUG FIX: the original built ParameterGrid({'n_neighbors': [range(1,15)]}),
# which wraps the range in a one-element list -- the single candidate
# "value" is the range object itself -- and passed a ParameterGrid
# instance where GridSearchCV expects a plain dict (or list of dicts)
# mapping each parameter name to a list of candidate values.
param_grid = {'n_neighbors': list(range(1, 15))}
estimator = KNeighborsClassifier()
# 4-fold cross-validated search over k, scored by ROC AUC; refit=True
# (the default) retrains the best model on the full training set.
knn_cv = GridSearchCV(estimator, param_grid, cv=4, scoring='roc_auc')
knn_cv.fit(train_data, train_target.values.flatten())
knn_cv.best_params_
{'n_neighbors': 7}
knn_cv.score
<bound method BaseSearchCV.score of GridSearchCV(cv=4, error_score='raise',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'),
fit_params=None, iid=True, n_jobs=1,
param_grid=<sklearn.model_selection._search.ParameterGrid object at 0x000000001169EC18>,
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='roc_auc', verbose=0)>
knn_cv.best_score_
0.9481481481481482
knn_cv.cv_results_
{'mean_fit_time': array([0.00050002, 0.00024998, 0.00049996, 0.00024998, 0.00050002,
0. , 0. , 0.00024998, 0.00049996, 0.00050008,
0.00025004, 0.00099993, 0.00175005, 0.00049996]),
'mean_score_time': array([0.00075001, 0.00125009, 0.00075006, 0.00100011, 0.00100005,
0.00100005, 0.00099999, 0.00075006, 0.00100017, 0.00099999,
0.00100011, 0.00150013, 0.00200009, 0.00225013]),
'mean_test_score': array([0.85244444, 0.90681481, 0.93562963, 0.92837037, 0.92688889,
0.94325926, 0.94814815, 0.93088889, 0.92422222, 0.93185185,
0.91585185, 0.91725926, 0.93296296, 0.94162963]),
'mean_train_score': array([1. , 0.98897083, 0.98438274, 0.9752876 , 0.9667673 ,
0.97197502, 0.96794222, 0.96384385, 0.95800081, 0.95163813,
0.95032495, 0.94546359, 0.94406611, 0.94205612]),
'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'n_neighbors': 1},
{'n_neighbors': 2},
{'n_neighbors': 3},
{'n_neighbors': 4},
{'n_neighbors': 5},
{'n_neighbors': 6},
{'n_neighbors': 7},
{'n_neighbors': 8},
{'n_neighbors': 9},
{'n_neighbors': 10},
{'n_neighbors': 11},
{'n_neighbors': 12},
{'n_neighbors': 13},
{'n_neighbors': 14}],
'rank_test_score': array([14, 13, 4, 8, 9, 2, 1, 7, 10, 6, 12, 11, 5, 3]),
'split0_test_score': array([0.8 , 0.84 , 0.85 , 0.845, 0.86 , 0.895, 0.935, 0.94 , 0.915,
0.955, 0.97 , 0.985, 0.99 , 0.995]),
'split0_train_score': array([1. , 0.99933862, 0.99470899, 0.99206349, 0.98544974,
0.98743386, 0.98280423, 0.97883598, 0.96296296, 0.96031746,
0.94378307, 0.9239418 , 0.91335979, 0.90806878]),
'split1_test_score': array([0.73333333, 0.9 , 0.93888889, 0.93888889, 0.90555556,
0.93333333, 0.92222222, 0.87222222, 0.87222222, 0.87777778,
0.82222222, 0.79444444, 0.82777778, 0.83333333]),
'split1_train_score': array([1. , 0.98979592, 0.98469388, 0.98852041, 0.97767857,
0.97066327, 0.97640306, 0.97321429, 0.97002551, 0.96938776,
0.97512755, 0.96875 , 0.97066327, 0.96237245]),
'split2_test_score': array([0.88888889, 0.90740741, 0.96296296, 0.95061728, 0.96296296,
0.96296296, 0.98148148, 0.97530864, 0.98765432, 0.98148148,
0.96296296, 0.98148148, 0.98765432, 1. ]),
'split2_train_score': array([1. , 0.98522167, 0.98214286, 0.9612069 , 0.9544335 ,
0.96859606, 0.95073892, 0.93965517, 0.93965517, 0.92610837,
0.93226601, 0.93780788, 0.93780788, 0.94027094]),
'split3_test_score': array([1. , 0.98765432, 1. , 0.98765432, 0.98765432,
0.98765432, 0.95679012, 0.9382716 , 0.92592593, 0.91358025,
0.90740741, 0.90740741, 0.92592593, 0.9382716 ]),
'split3_train_score': array([1. , 0.98152709, 0.97598522, 0.95935961, 0.94950739,
0.9612069 , 0.96182266, 0.96366995, 0.95935961, 0.95073892,
0.95012315, 0.95135468, 0.9544335 , 0.95751232]),
'std_fit_time': array([0.00050002, 0.00043298, 0.00049996, 0.00043298, 0.00050002,
0. , 0. , 0.00043298, 0.00049996, 0.00050008,
0.00043308, 0. , 0.00248759, 0.00049996]),
'std_score_time': array([4.33015741e-04, 4.33015741e-04, 4.33050154e-04, 1.03238273e-07,
1.19209290e-07, 1.19209290e-07, 1.03238273e-07, 4.33050154e-04,
0.00000000e+00, 1.03238273e-07, 1.03238273e-07, 4.99963760e-04,
7.07224118e-04, 1.08974147e-03]),
'std_test_score': array([0.09935694, 0.05263757, 0.05596217, 0.05332738, 0.05006896,
0.03481684, 0.02238234, 0.03715753, 0.04091529, 0.0394323 ,
0.05960289, 0.07783317, 0.0663336 , 0.06746183]),
'std_train_score': array([0. , 0.00666403, 0.00675025, 0.01507071, 0.01515022,
0.00959253, 0.01250868, 0.01498075, 0.01126527, 0.01614742,
0.01568543, 0.01657377, 0.02119514, 0.02127189])}
# BUG FIX: ROC AUC should rank by predicted probability of the positive
# class, not by hard 0/1 labels from predict() -- label-based "AUC"
# understates the model and is inconsistent with scoring='roc_auc' used
# in the grid search (which also uses probabilities).
metrics.roc_auc_score(test_target, knn_cv.predict_proba(test_data)[:, 1])
0.9198717948717948
练习:试一试哪些参数会影响结果
朴素贝叶斯
GaussianNB
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

# Gaussian naive Bayes on the same scaled continuous features.
gb = GaussianNB()
gb.fit(train_data, train_target.values.flatten())
gb_est = gb.predict(test_data)
# BUG FIX: the original printed classification_report(test_target, test_est),
# i.e. it re-reported the earlier KNN predictions; the GaussianNB
# predictions are gb_est.
print(metrics.classification_report(test_target, gb_est))
precision recall f1-score support
0 0.92 0.92 0.92 12
1 0.92 0.92 0.92 13
avg / total 0.92 0.92 0.92 25
# NOTE(review): this is accuracy on the *training* data, not the test
# set -- presumably intentional as a fit-quality check; confirm intent.
print(gb.score(train_data, train_target))
0.8666666666666667
BernoulliNB, MultinomialNB
# First five rows, including the discretised *_rank columns used next.
orgData.head()
income | attractive | assets | edueduclass | Dated | income_rank | attractive_rank | assets_rank | |
---|---|---|---|---|---|---|---|---|
0 | 3000 | 9.0 | 5.145476 | 1 | 0 | 0 | 0 | 0 |
1 | 3000 | 14.5 | 40.643781 | 4 | 1 | 0 | 0 | 1 |
2 | 3000 | 6.0 | 5.145476 | 1 | 0 | 0 | 0 | 0 |
3 | 3000 | 1.0 | 7.067434 | 1 | 0 | 0 | 0 | 0 |
4 | 3500 | 14.5 | 3.728400 | 2 | 0 | 0 | 0 | 0 |
# Keep only the last three (ordinal rank) columns as discrete features.
orgData1 = orgData[orgData.columns[-3:]]
# 70/30 split, same seed as before but a different test fraction.
train_data1, test_data1, train_target1, test_target1 = train_test_split(
    orgData1, Y, random_state=123, train_size=0.7, test_size=0.3)
- 建模
# Fit Bernoulli and multinomial NB with Laplace smoothing (alpha=1);
# fit() returns the estimator, so the chained form binds fitted models.
flat_target1 = train_target1.values.flatten()
nb = BernoulliNB(alpha=1).fit(train_data1, flat_target1)
mb = MultinomialNB(alpha=1).fit(train_data1, flat_target1)
test_est1 = nb.predict(test_data1)
test_est2 = mb.predict(test_data1)
- 验证
# Side-by-side reports: BernoulliNB predictions first, then MultinomialNB.
for est in (test_est1, test_est2):
    print(metrics.classification_report(test_target1, est))
precision recall f1-score support
0 0.89 0.57 0.70 14
1 0.71 0.94 0.81 16
avg / total 0.80 0.77 0.76 30
precision recall f1-score support
0 0.91 0.71 0.80 14
1 0.79 0.94 0.86 16
avg / total 0.85 0.83 0.83 30
# NOTE(review): training-set accuracy for both NB models (not test-set);
# mirrors the GaussianNB check earlier -- confirm this is intentional.
print(nb.score(train_data1, train_target1))
print(mb.score(train_data1, train_target1))
0.7571428571428571
0.8285714285714286