最近邻与朴素贝叶斯

KNN

字段含义
income收入
attractive吸引力
assets资产
edueduclass教育程度
dated是否约会成功
income_rank收入等级
attractive_rank吸引力等级
assets_rank资产等级
  • 加载数据集
# Load the speed-dating dataset into a DataFrame.
import pandas as pd
import os

# os.chdir('Q:/data')  # uncomment to switch to the directory holding the CSV
orgData = pd.read_csv('date_data2.csv')
orgData.describe()  # summary statistics for every numeric column
incomeattractiveassetsedueduclassDatedincome_rankattractive_rankassets_rank
count100.000000100.000000100.000000100.000000100.000000100.000000100.000000100.000000
mean9010.00000050.50000096.0063003.7100000.5000001.5500001.5600001.510000
std5832.67528828.81094891.0822261.2251160.5025191.1403971.1038961.123621
min3000.0000001.0000003.7284001.0000000.0000000.0000000.0000000.000000
25%5000.00000028.00000031.6652693.0000000.0000001.0000001.0000000.750000
50%7500.00000051.00000070.7469244.0000000.5000002.0000002.0000002.000000
75%11500.00000068.875000131.4810614.0000001.0000003.0000002.2500002.250000
max34000.00000099.500000486.3117586.0000001.0000003.0000003.0000003.000000
  • 选取自变量
# Features: the first four columns (income, attractive, assets, edueduclass);
# target: the Dated indicator, kept as a one-column DataFrame.
X = orgData.iloc[:, 0:4]
Y = orgData.loc[:, ['Dated']]
  • 极值标准化
from sklearn.preprocessing import MinMaxScaler

# Min-max (range) normalisation: rescale each feature to [0, 1] so that
# the distance metric is not dominated by the large-valued columns.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[1:5]  # peek at rows 1-4 of the scaled matrix
array([[0.        , 0.13705584, 0.07649535, 0.6       ],
       [0.        , 0.05076142, 0.00293644, 0.        ],
       [0.        , 0.        , 0.00691908, 0.        ],
       [0.01612903, 0.13705584, 0.        , 0.2       ]])
  • 划分训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 25% of the scaled data for testing; fixed seed for reproducibility.
splits = train_test_split(X_scaled, Y,
                          test_size=0.25, train_size=0.75,
                          random_state=123)
train_data, test_data, train_target, test_target = splits
  • 建模
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier; the default metric is Euclidean distance.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(train_data, train_target.values.ravel())

test_est = model.predict(test_data)
  • 验证
import sklearn.metrics as metrics

# Per-class precision / recall / F1 on the held-out test set.
print(metrics.classification_report(y_true=test_target, y_pred=test_est))
             precision    recall  f1-score   support

          0       0.92      0.92      0.92        12
          1       0.92      0.92      0.92        13

avg / total       0.92      0.92      0.92        25
model.score(test_data, test_target)  # mean accuracy on the test set
0.92
  • 选择k值
# Try k = 1..9 on the same split and report the test-set accuracy of each.
for neighbours in range(1, 10):
    candidate = KNeighborsClassifier(n_neighbors=neighbours)
    candidate.fit(train_data, train_target.values.ravel())
    accuracy = candidate.score(test_data, test_target)
    print('When k=%s , the score is %.4f' % (neighbours, accuracy))
When k=1 , the score is 0.9200
When k=2 , the score is 0.8800
When k=3 , the score is 0.9200
When k=4 , the score is 0.9200
When k=5 , the score is 0.8800
When k=6 , the score is 0.8800
When k=7 , the score is 0.9200
When k=8 , the score is 0.8800
When k=9 , the score is 0.9200
  • 交叉验证选择k值
from sklearn.model_selection import GridSearchCV

# Cross-validated search over k. GridSearchCV expects a plain dict mapping
# parameter names to lists of candidate values — not a ParameterGrid object,
# which modern sklearn rejects. Note also that wrapping range(1, 15) in an
# extra list ([range(1, 15)]) would make the whole range a single (invalid)
# candidate instead of 14 separate ones.
param_grid = {'n_neighbors': list(range(1, 15))}
estimator = KNeighborsClassifier()
# 4-fold CV, selecting the k with the best mean ROC AUC.
knn_cv = GridSearchCV(estimator, param_grid, cv=4, scoring='roc_auc')
knn_cv.fit(train_data, train_target.values.flatten())

knn_cv.best_params_  # best k found by the search
{'n_neighbors': 7}
# `score` is a method: referencing it without parentheses only echoes the
# bound-method object (as the output below shows). Call it on the test set
# to get the actual score under the search's scoring function.
knn_cv.score(test_data, test_target)
<bound method BaseSearchCV.score of GridSearchCV(cv=4, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=<sklearn.model_selection._search.ParameterGrid object at 0x000000001169EC18>,
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)>
knn_cv.best_score_  # best mean cross-validated AUC over the 4 folds
0.9481481481481482
knn_cv.cv_results_  # full per-candidate, per-fold CV diagnostics
{'mean_fit_time': array([0.00050002, 0.00024998, 0.00049996, 0.00024998, 0.00050002,
        0.        , 0.        , 0.00024998, 0.00049996, 0.00050008,
        0.00025004, 0.00099993, 0.00175005, 0.00049996]),
 'mean_score_time': array([0.00075001, 0.00125009, 0.00075006, 0.00100011, 0.00100005,
        0.00100005, 0.00099999, 0.00075006, 0.00100017, 0.00099999,
        0.00100011, 0.00150013, 0.00200009, 0.00225013]),
 'mean_test_score': array([0.85244444, 0.90681481, 0.93562963, 0.92837037, 0.92688889,
        0.94325926, 0.94814815, 0.93088889, 0.92422222, 0.93185185,
        0.91585185, 0.91725926, 0.93296296, 0.94162963]),
 'mean_train_score': array([1.        , 0.98897083, 0.98438274, 0.9752876 , 0.9667673 ,
        0.97197502, 0.96794222, 0.96384385, 0.95800081, 0.95163813,
        0.95032495, 0.94546359, 0.94406611, 0.94205612]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},
  {'n_neighbors': 7},
  {'n_neighbors': 8},
  {'n_neighbors': 9},
  {'n_neighbors': 10},
  {'n_neighbors': 11},
  {'n_neighbors': 12},
  {'n_neighbors': 13},
  {'n_neighbors': 14}],
 'rank_test_score': array([14, 13,  4,  8,  9,  2,  1,  7, 10,  6, 12, 11,  5,  3]),
 'split0_test_score': array([0.8  , 0.84 , 0.85 , 0.845, 0.86 , 0.895, 0.935, 0.94 , 0.915,
        0.955, 0.97 , 0.985, 0.99 , 0.995]),
 'split0_train_score': array([1.        , 0.99933862, 0.99470899, 0.99206349, 0.98544974,
        0.98743386, 0.98280423, 0.97883598, 0.96296296, 0.96031746,
        0.94378307, 0.9239418 , 0.91335979, 0.90806878]),
 'split1_test_score': array([0.73333333, 0.9       , 0.93888889, 0.93888889, 0.90555556,
        0.93333333, 0.92222222, 0.87222222, 0.87222222, 0.87777778,
        0.82222222, 0.79444444, 0.82777778, 0.83333333]),
 'split1_train_score': array([1.        , 0.98979592, 0.98469388, 0.98852041, 0.97767857,
        0.97066327, 0.97640306, 0.97321429, 0.97002551, 0.96938776,
        0.97512755, 0.96875   , 0.97066327, 0.96237245]),
 'split2_test_score': array([0.88888889, 0.90740741, 0.96296296, 0.95061728, 0.96296296,
        0.96296296, 0.98148148, 0.97530864, 0.98765432, 0.98148148,
        0.96296296, 0.98148148, 0.98765432, 1.        ]),
 'split2_train_score': array([1.        , 0.98522167, 0.98214286, 0.9612069 , 0.9544335 ,
        0.96859606, 0.95073892, 0.93965517, 0.93965517, 0.92610837,
        0.93226601, 0.93780788, 0.93780788, 0.94027094]),
 'split3_test_score': array([1.        , 0.98765432, 1.        , 0.98765432, 0.98765432,
        0.98765432, 0.95679012, 0.9382716 , 0.92592593, 0.91358025,
        0.90740741, 0.90740741, 0.92592593, 0.9382716 ]),
 'split3_train_score': array([1.        , 0.98152709, 0.97598522, 0.95935961, 0.94950739,
        0.9612069 , 0.96182266, 0.96366995, 0.95935961, 0.95073892,
        0.95012315, 0.95135468, 0.9544335 , 0.95751232]),
 'std_fit_time': array([0.00050002, 0.00043298, 0.00049996, 0.00043298, 0.00050002,
        0.        , 0.        , 0.00043298, 0.00049996, 0.00050008,
        0.00043308, 0.        , 0.00248759, 0.00049996]),
 'std_score_time': array([4.33015741e-04, 4.33015741e-04, 4.33050154e-04, 1.03238273e-07,
        1.19209290e-07, 1.19209290e-07, 1.03238273e-07, 4.33050154e-04,
        0.00000000e+00, 1.03238273e-07, 1.03238273e-07, 4.99963760e-04,
        7.07224118e-04, 1.08974147e-03]),
 'std_test_score': array([0.09935694, 0.05263757, 0.05596217, 0.05332738, 0.05006896,
        0.03481684, 0.02238234, 0.03715753, 0.04091529, 0.0394323 ,
        0.05960289, 0.07783317, 0.0663336 , 0.06746183]),
 'std_train_score': array([0.        , 0.00666403, 0.00675025, 0.01507071, 0.01515022,
        0.00959253, 0.01250868, 0.01498075, 0.01126527, 0.01614742,
        0.01568543, 0.01657377, 0.02119514, 0.02127189])}
metrics.roc_auc_score(test_target,knn_cv.predict(test_data))  # AUC of the tuned model on the test set
0.9198717948717948

练习:试一试哪些参数会影响结果

朴素贝叶斯

GaussianNB

from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

# Gaussian Naive Bayes on the scaled continuous features.
gb = GaussianNB()
gb.fit(train_data, train_target.values.flatten())
gb_est = gb.predict(test_data)
# Bug fix: the report must score gb_est (the GaussianNB predictions) —
# the original passed test_est, i.e. the earlier KNN predictions, so the
# printed report said nothing about GaussianNB.
print(metrics.classification_report(test_target, gb_est))
             precision    recall  f1-score   support

          0       0.92      0.92      0.92        12
          1       0.92      0.92      0.92        13

avg / total       0.92      0.92      0.92        25
print(gb.score(train_data, train_target))  # GaussianNB accuracy on the training set
0.8666666666666667

BernoulliNB, MultinomialNB

orgData.head()  # preview the first five rows
incomeattractiveassetsedueduclassDatedincome_rankattractive_rankassets_rank
030009.05.14547610000
1300014.540.64378141001
230006.05.14547610000
330001.07.06743410000
4350014.53.72840020000
# Discrete features for the count-based NB models: the last three columns
# (income_rank, attractive_rank, assets_rank).
orgData1 = orgData.iloc[:, -3:]

# New 70/30 split on the rank features (same seed, same target Y).
train_data1, test_data1, train_target1, test_target1 = train_test_split(
    orgData1, Y, test_size=0.3, train_size=0.7, random_state=123)
  • 建模
# Laplace smoothing (alpha=1) for both discrete Naive Bayes variants.
nb = BernoulliNB(alpha=1)
mb = MultinomialNB(alpha=1)

labels = train_target1.values.flatten()
nb.fit(train_data1, labels)
mb.fit(train_data1, labels)

test_est1 = nb.predict(test_data1)
test_est2 = mb.predict(test_data1)
  • 验证
# Compare the two discrete NB models on the held-out rank features.
print(metrics.classification_report(test_target1, test_est1))
print(metrics.classification_report(test_target1, test_est2))
             precision    recall  f1-score   support

          0       0.89      0.57      0.70        14
          1       0.71      0.94      0.81        16

avg / total       0.80      0.77      0.76        30

             precision    recall  f1-score   support

          0       0.91      0.71      0.80        14
          1       0.79      0.94      0.86        16

avg / total       0.85      0.83      0.83        30
# Training-set accuracy of BernoulliNB vs MultinomialNB.
print(nb.score(train_data1, train_target1))
print(mb.score(train_data1, train_target1))
0.7571428571428571
0.8285714285714286
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值