kaggle_GiveMeSomeCredit_建模比赛

kaggle提交结果:
在这里插入图片描述
我的private score 0.86699, public score 0.86101,
榜单第一名private score 0.86955, public score 0.86390.

Importing libraries

import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

import scikitplot as skplt
import scipy

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix as cm, auc, roc_curve

Reading the data

# Load the training set, the Kaggle test set and the sample submission.
# In each file the first CSV column is used as the row index.
data, test_data, sample_data = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('cs-training.csv', 'cs-test.csv', 'sampleEntry.csv')
)

EDA

# Preview the first five rows of the training set.
data.head()
SeriousDlqin2yrsRevolvingUtilizationOfUnsecuredLinesageNumberOfTime30-59DaysPastDueNotWorseDebtRatioMonthlyIncomeNumberOfOpenCreditLinesAndLoansNumberOfTimes90DaysLateNumberRealEstateLoansOrLinesNumberOfTime60-89DaysPastDueNotWorseNumberOfDependents
110.7661274520.8029829120.0130602.0
200.9571514000.1218762600.040001.0
300.6581803810.0851133042.021000.0
400.2338103000.0360503300.050000.0
500.9072394910.02492663588.070100.0
# Preview the first five rows of the test set (the target column is all NaN there).
test_data.head()
SeriousDlqin2yrsRevolvingUtilizationOfUnsecuredLinesageNumberOfTime30-59DaysPastDueNotWorseDebtRatioMonthlyIncomeNumberOfOpenCreditLinesAndLoansNumberOfTimes90DaysLateNumberRealEstateLoansOrLinesNumberOfTime60-89DaysPastDueNotWorseNumberOfDependents
1NaN0.8855194300.1775135700.040000.0
2NaN0.4632955700.5272379141.0150402.0
3NaN0.0432755900.6876485083.0120102.0
4NaN0.2803083810.9259613200.070200.0
5NaN1.0000002700.0199173865.040001.0
# Summary statistics of the training set at selected percentiles, one row per column.
data.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).transpose()
countmeanstdmin1%10%25%50%75%90%99%max
SeriousDlqin2yrs150000.00.0668400.2497460.00.00.0000000.0000000.0000000.0000000.0000001.0000001.0
RevolvingUtilizationOfUnsecuredLines150000.06.048438249.7553710.00.00.0029690.0298670.1541810.5590460.9812781.09295650708.0
age150000.052.29520714.7718660.024.033.00000041.00000052.00000063.00000072.00000087.000000109.0
NumberOfTime30-59DaysPastDueNotWorse150000.00.4210334.1927810.00.00.0000000.0000000.0000000.0000001.0000004.00000098.0
DebtRatio150000.0353.0050762037.8185230.00.00.0308740.1750740.3665080.8682541267.0000004979.040000329664.0
MonthlyIncome120269.06670.22123714384.6742150.00.02005.0000003400.0000005400.0000008249.00000011666.00000025000.0000003008750.0
NumberOfOpenCreditLinesAndLoans150000.08.4527605.1459510.00.03.0000005.0000008.00000011.00000015.00000024.00000058.0
NumberOfTimes90DaysLate150000.00.2659734.1693040.00.00.0000000.0000000.0000000.0000000.0000003.00000098.0
NumberRealEstateLoansOrLines150000.01.0182401.1297710.00.00.0000000.0000001.0000002.0000002.0000004.00000054.0
NumberOfTime60-89DaysPastDueNotWorse150000.00.2403874.1551790.00.00.0000000.0000000.0000000.0000000.0000002.00000098.0
NumberOfDependents146076.00.7572221.1150860.00.00.0000000.0000000.0000001.0000002.0000004.00000020.0
# Summary statistics of the test set at selected percentiles, one row per column.
test_data.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).transpose()
countmeanstdmin1%10%25%50%75%90%99%max
SeriousDlqin2yrs0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
RevolvingUtilizationOfUnsecuredLines101503.05.310000196.1560390.00.00.0030080.0301310.1525860.5642250.9833421.0886921821.0
age101503.052.40543614.77975621.024.033.00000041.00000052.00000063.00000072.00000087.00000104.0
NumberOfTime30-59DaysPastDueNotWorse101503.00.4537704.5384870.00.00.0000000.0000000.0000000.0000001.0000004.0000098.0
DebtRatio101503.0344.4750201632.5952310.00.00.0300580.1734230.3642600.8516191238.8000004963.00000268326.0
MonthlyIncome81400.06855.03559036508.6003750.00.02083.0000003408.0000005400.0000008200.00000011500.00000025916.010007727000.0
NumberOfOpenCreditLinesAndLoans101503.08.4535145.1441000.00.03.0000005.0000008.00000011.00000015.00000025.0000085.0
NumberOfTimes90DaysLate101503.00.2966914.5158590.00.00.0000000.0000000.0000000.0000000.0000003.0000098.0
NumberRealEstateLoansOrLines101503.01.0130741.1102530.00.00.0000000.0000001.0000002.0000002.0000004.0000037.0
NumberOfTime60-89DaysPastDueNotWorse101503.00.2703174.5035780.00.00.0000000.0000000.0000000.0000000.0000002.0000098.0
NumberOfDependents98877.00.7690461.1367780.00.00.0000000.0000000.0000001.0000002.0000004.0000043.0
# (rows, columns) of the training set.
data.shape
(150000, 11)
# (rows, columns) of the test set.
test_data.shape
(101503, 11)
# Column dtypes and non-null counts of the training set.
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
# Column dtypes and non-null counts of the test set.
test_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 101503 entries, 1 to 101503
Data columns (total 11 columns):
SeriousDlqin2yrs                        0 non-null float64
RevolvingUtilizationOfUnsecuredLines    101503 non-null float64
age                                     101503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    101503 non-null int64
DebtRatio                               101503 non-null float64
MonthlyIncome                           81400 non-null float64
NumberOfOpenCreditLinesAndLoans         101503 non-null int64
NumberOfTimes90DaysLate                 101503 non-null int64
NumberRealEstateLoansOrLines            101503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    101503 non-null int64
NumberOfDependents                      98877 non-null float64
dtypes: float64(5), int64(6)
memory usage: 9.3 MB

drop_duplicates

# Remove duplicate rows.
# Only the training set is de-duplicated; the test set must keep every row.
data = data.drop_duplicates()
data.info()

# Renumber the rows 0..n-1 so the index has no gaps after the drop.
data = data.reset_index(drop=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 149391 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs                        149391 non-null int64
RevolvingUtilizationOfUnsecuredLines    149391 non-null float64
age                                     149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    149391 non-null int64
DebtRatio                               149391 non-null float64
MonthlyIncome                           120170 non-null float64
NumberOfOpenCreditLinesAndLoans         149391 non-null int64
NumberOfTimes90DaysLate                 149391 non-null int64
NumberRealEstateLoansOrLines            149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    149391 non-null int64
NumberOfDependents                      145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149391 entries, 0 to 149390
Data columns (total 11 columns):
SeriousDlqin2yrs                        149391 non-null int64
RevolvingUtilizationOfUnsecuredLines    149391 non-null float64
age                                     149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    149391 non-null int64
DebtRatio                               149391 non-null float64
MonthlyIncome                           120170 non-null float64
NumberOfOpenCreditLinesAndLoans         149391 non-null int64
NumberOfTimes90DaysLate                 149391 non-null int64
NumberRealEstateLoansOrLines            149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    149391 non-null int64
NumberOfDependents                      145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.5 MB

Checking Null values

# Fraction of missing values per column in the training set.
data.isnull().mean()
SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198207
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.026160
dtype: float64
# Number of distinct values per column in the training set.
data.nunique()
SeriousDlqin2yrs                             2
RevolvingUtilizationOfUnsecuredLines    125728
age                                         86
NumberOfTime30-59DaysPastDueNotWorse        16
DebtRatio                               114194
MonthlyIncome                            13594
NumberOfOpenCreditLinesAndLoans             58
NumberOfTimes90DaysLate                     19
NumberRealEstateLoansOrLines                28
NumberOfTime60-89DaysPastDueNotWorse        13
NumberOfDependents                          13
dtype: int64
# Fraction of missing values per column in the test set.
test_data.isnull().mean()
SeriousDlqin2yrs                        1.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198053
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.025871
dtype: float64
# NumberOfDependents: fill missing values with the most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data[...]`: the in-place form works on an
# intermediate Series, emits a chained-assignment warning on pandas 2.x and
# leaves `data` untouched when copy-on-write is enabled.
dependents_mode = int(data['NumberOfDependents'].mode()[0])
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(dependents_mode)

data.isnull().mean()
SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198207
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.000000
dtype: float64
# Test set: fill missing NumberOfDependents with the test set's most frequent value.
# Assigned back rather than fillna(inplace=True) on the column selection, which
# is not guaranteed to modify `test_data` (it is a no-op under copy-on-write).
test_dependents_mode = int(test_data['NumberOfDependents'].mode()[0])
test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(test_dependents_mode)

test_data.isnull().mean()
SeriousDlqin2yrs                        1.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198053
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.000000
dtype: float64
# MonthlyIncome: fill missing values with the column mean truncated to an int.
# Assigned back rather than fillna(inplace=True) on the column selection, which
# is not guaranteed to modify `data` (it is a no-op under copy-on-write).
income_mean = int(data['MonthlyIncome'].mean())
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(income_mean)

data.isnull().mean()
SeriousDlqin2yrs                        0.0
RevolvingUtilizationOfUnsecuredLines    0.0
age                                     0.0
NumberOfTime30-59DaysPastDueNotWorse    0.0
DebtRatio                               0.0
MonthlyIncome                           0.0
NumberOfOpenCreditLinesAndLoans         0.0
NumberOfTimes90DaysLate                 0.0
NumberRealEstateLoansOrLines            0.0
NumberOfTime60-89DaysPastDueNotWorse    0.0
NumberOfDependents                      0.0
dtype: float64
# Test set: fill missing MonthlyIncome with the test set's mean truncated to an int.
# Assigned back rather than fillna(inplace=True) on the column selection, which
# is not guaranteed to modify `test_data` (it is a no-op under copy-on-write).
test_income_mean = int(test_data['MonthlyIncome'].mean())
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].fillna(test_income_mean)

test_data.isnull().mean()
SeriousDlqin2yrs                        1.0
RevolvingUtilizationOfUnsecuredLines    0.0
age                                     0.0
NumberOfTime30-59DaysPastDueNotWorse    0.0
DebtRatio                               0.0
MonthlyIncome                           0.0
NumberOfOpenCreditLinesAndLoans         0.0
NumberOfTimes90DaysLate                 0.0
NumberRealEstateLoansOrLines            0.0
NumberOfTime60-89DaysPastDueNotWorse    0.0
NumberOfDependents                      0.0
dtype: float64

Visualization

# Bar chart of the class counts of the target column.
sns.countplot(data=data, x='SeriousDlqin2yrs')
plt.show()

在这里插入图片描述

# Annotated heatmap of the pairwise column correlations (three decimals).
corr_matrix = data.corr()
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.3f', linewidths=.5)
plt.show()

在这里插入图片描述

Unusual values

# Training set: rows with age == 0 are treated as invalid and overwritten
# with the column mean truncated to an int.
train_mean_age = int(data['age'].mean())
data.loc[data['age'] == 0, 'age'] = train_mean_age


# Test set: same rule, using the test set's own mean age.
test_mean_age = int(test_data['age'].mean())
test_data.loc[test_data['age'] == 0, 'age'] = test_mean_age

# Separate the features from the target, then hold out 20% of the rows
# for evaluation (fixed seed so the split is reproducible).
X = data.drop(columns='SeriousDlqin2yrs')
y = data['SeriousDlqin2yrs']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=116214
)

# Oversample the training split with SMOTE (the held-out split is left untouched).
# `fit_resample` is the supported entry point; the older `fit_sample` alias
# was removed from imbalanced-learn and raises AttributeError on current releases.
smote = SMOTE(random_state=0)
s_X, s_y = smote.fit_resample(X_train, y_train)

LogisticRegression model

# Baseline model: logistic regression fitted on the original (non-resampled) training split.
LR = LogisticRegression()
LR.fit(X_train, y_train)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR.score(X_train, y_train), LR.score(X_test, y_test)))

# The training-side AUC is measured on X_train / y_train, the data this model
# was fitted on (the original scored it on the SMOTE-resampled set, which this
# model never saw, so the figure was not a training metric).
# First from hard class labels, then from positive-class probabilities.
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(y_train, LR.predict(X_train)),
                                                       roc_auc_score(y_test, LR.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(y_train, LR.predict_proba(X_train)[:,1]),
                                                               roc_auc_score(y_test, LR.predict_proba(X_test)[:,1])))
训练集精确度:0.933075,测试集精确度:0.9334666666666667

predict 训练集auc:0.5064304086881967,测试集auc:0.5078828573129228

predict_proba 训练集auc:0.6839255109393088,测试集auc:0.6871543091031289
# Baseline 2: logistic regression fitted on the SMOTE-resampled training data.
LR_s = LogisticRegression()
LR_s.fit(s_X, s_y)

resampled_acc = LR_s.score(s_X, s_y)
holdout_acc = LR_s.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(resampled_acc, holdout_acc))

# AUC computed from hard class labels.
resampled_label_auc = roc_auc_score(s_y, LR_s.predict(s_X))
holdout_label_auc = roc_auc_score(y_test, LR_s.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(resampled_label_auc, holdout_label_auc))

# AUC computed from positive-class probabilities.
resampled_proba_auc = roc_auc_score(s_y, LR_s.predict_proba(s_X)[:, 1])
holdout_proba = LR_s.predict_proba(X_test)
holdout_proba_auc = roc_auc_score(y_test, holdout_proba[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(resampled_proba_auc, holdout_proba_auc))

# ROC curves for both classes on the held-out split (no micro/macro averages).
skplt.metrics.plot_roc(y_test, pd.DataFrame(holdout_proba), plot_micro=False, figsize=(6,6), plot_macro=False)
训练集精确度:0.6768451700485854,测试集精确度:0.8703333333333333

predict 训练集auc:0.6768451700485852,测试集auc:0.7351483859562692

predict_proba 训练集auc:0.7682496553604893,测试集auc:0.8034115289523501

在这里插入图片描述

# Randomized hyper-parameter search for logistic regression on the SMOTE data.
LR_r = LogisticRegression()

# C must be strictly positive, so the original 0 is dropped.
c_values = [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
iter_values = range(1, 300, 50)

# The search space is split by solver capability: only liblinear and saga
# support the l1 penalty, while newton-cg, lbfgs and sag accept l2 only.
# Mixing them in a single dict lets the sampler draw combinations whose fit fails.
param_dist = [
    {
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'solver': ['liblinear', 'saga'],
        'max_iter': iter_values,
    },
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': iter_values,
    },
]
LR_random = RandomizedSearchCV(LR_r,param_distributions=param_dist,cv=5,n_jobs = -1)
LR_random.fit(s_X,s_y)

best_est_LR = LR_random.best_estimator_
print('最优参数:{0}\n'.format(best_est_LR))

print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR_random.score(s_X,s_y), LR_random.score(X_test, y_test)))

# Hard-label predictions of the refitted best estimator, reused for the
# AUC figures and for the confusion matrices below.
y_pred_train_sr = LR_random.predict(s_X)
y_pred_sr = LR_random.predict(X_test)
print('训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(s_y, y_pred_train_sr),roc_auc_score(y_test, y_pred_sr)))

# Confusion matrices with the positive class (1) listed first.
print('训练集混淆矩阵:\n{0},\n\n测试集混淆矩阵:\n{1}'.format(cm(s_y, y_pred_train_sr,labels=[1,0]),
                                                               cm(y_test, y_pred_sr,labels=[1,0])))
最优参数:LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=251,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

训练集精确度:0.6722322449271221,测试集精确度:0.722

训练集auc:0.672232244927122,测试集auc:0.6570985182928633

训练集混淆矩阵:
[[68407 43561]
 [29838 82130]],

测试集混淆矩阵:
[[ 1161   833]
 [ 7507 20499]]
# Manual tuning: compare the held-out AUC of each solver (other settings default).
solver_names = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
auc_score = []
for solver_name in solver_names:
    LR_m = LogisticRegression(solver=solver_name).fit(s_X, s_y)
    positive_proba = LR_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))

print(max(auc_score))
plt.plot(solver_names, auc_score)
plt.show()
0.8034115289523501

在这里插入图片描述

# Manual tuning: held-out AUC for C in {0.1, 0.5, 1} with the lbfgs solver.
c_grid = [0.1, 0.5, 1]
auc_score = []
for c_value in c_grid:
    LR_m = LogisticRegression(solver='lbfgs', C=c_value).fit(s_X, s_y)
    positive_proba = LR_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))
print(max(auc_score))
plt.plot(c_grid, auc_score)
plt.show()
0.8072174013291749

在这里插入图片描述

# Manual tuning: held-out AUC for smaller C values {0.01, 0.05, 0.1} with lbfgs.
c_grid = [0.01, 0.05, 0.1]
auc_score = []
for c_value in c_grid:
    LR_m = LogisticRegression(solver='lbfgs', C=c_value).fit(s_X, s_y)
    positive_proba = LR_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))
print(max(auc_score))
plt.plot(c_grid, auc_score)
plt.show()
0.8072174013291749

在这里插入图片描述

# Manual tuning: held-out AUC for max_iter in {50, 100, 150, 200} (lbfgs, C=0.1).
iter_grid = [50, 100, 150, 200]
auc_score = []
for iter_cap in iter_grid:
    LR_m = LogisticRegression(solver='lbfgs', C=0.1, max_iter=iter_cap).fit(s_X, s_y)
    positive_proba = LR_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))
print(max(auc_score))
plt.plot(iter_grid, auc_score)
plt.show()
0.8072174013291749

在这里插入图片描述

# Manual tuning: held-out AUC for max_iter from 30 to 110 in steps of 10 (lbfgs, C=0.1).
iter_grid = list(range(30, 120, 10))
auc_score = []
for iter_cap in iter_grid:
    LR_m = LogisticRegression(solver='lbfgs', C=0.1, max_iter=iter_cap).fit(s_X, s_y)
    positive_proba = LR_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))

# Report the first setting that reaches the best AUC, with that AUC.
best_auc = max(auc_score)
print(iter_grid[auc_score.index(best_auc)], best_auc)
plt.plot(iter_grid, auc_score)
plt.show()
80 0.807376433735972

在这里插入图片描述

# Refit logistic regression with the hand-tuned settings on the SMOTE data,
# score the Kaggle test set and write the submission file.
best_est_LR = LogisticRegression(solver='lbfgs', C=0.1, max_iter=80).fit(s_X,s_y)
df_test = test_data.drop('SeriousDlqin2yrs', axis=1)
sample_data["Probability"] = best_est_LR.predict_proba(df_test)[:,1]
print(sample_data.head())
# The Id lives in the DataFrame index, so it must be written out
# (index=True, as the random-forest submission does); with index=False the
# CSV would contain only the Probability column.
sample_data.to_csv("submission_LR.csv",index=True)
    Probability
Id             
1      0.403650
2      0.392559
3      0.419110
4      0.487821
5      0.472049

Random Forest model

# Baseline model: default random forest fitted on the original training split.
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)

train_acc = RFC.score(X_train, y_train)
holdout_acc = RFC.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(train_acc, holdout_acc))

# AUC computed from hard class labels.
train_label_auc = roc_auc_score(y_train, RFC.predict(X_train))
holdout_label_auc = roc_auc_score(y_test, RFC.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(train_label_auc, holdout_label_auc))

# AUC computed from positive-class probabilities.
train_proba_auc = roc_auc_score(y_train, RFC.predict_proba(X_train)[:, 1])
holdout_proba_auc = roc_auc_score(y_test, RFC.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(train_proba_auc, holdout_proba_auc))
训练集精确度:0.9995916666666667,测试集精确度:0.9359

predict 训练集auc:0.997585340973915,测试集auc:0.5895362836348795

predict_proba 训练集auc:0.9999771540267908,测试集auc:0.8437090354115978
# Baseline 2: default random forest fitted on the SMOTE-resampled training data.
RFC_s = RandomForestClassifier()
RFC_s.fit(s_X, s_y)

resampled_acc = RFC_s.score(s_X, s_y)
holdout_acc = RFC_s.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(resampled_acc, holdout_acc))

# AUC computed from hard class labels.
resampled_label_auc = roc_auc_score(s_y, RFC_s.predict(s_X))
holdout_label_auc = roc_auc_score(y_test, RFC_s.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(resampled_label_auc, holdout_label_auc))

# AUC computed from positive-class probabilities.
resampled_proba_auc = roc_auc_score(s_y, RFC_s.predict_proba(s_X)[:, 1])
holdout_proba_auc = roc_auc_score(y_test, RFC_s.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(resampled_proba_auc, holdout_proba_auc))
训练集精确度:0.9993390968848243,测试集精确度:0.8908333333333334

predict 训练集auc:0.9993390968848243,测试集auc:0.6839441591216555

predict_proba 训练集auc:0.9999081457017259,测试集auc:0.8237175910363382
# Randomized hyper-parameter search for the random forest on the SMOTE data.
RFC_r = RandomForestClassifier()
param_dist = {
        'n_estimators':[10, 20, 30, 40, 50, 100, 150, 200, 300],
        'max_depth':range(2, 10, 1),
        'min_samples_leaf':range(2, 10, 1),
        }
RFC_random = RandomizedSearchCV(RFC_r,param_distributions=param_dist,cv=5,n_jobs = -1)
RFC_random.fit(s_X,s_y)

best_est_RFC = RFC_random.best_estimator_
print('最优参数:{0}\n'.format(best_est_RFC))

print('训练集精确度:{0},测试集精确度:{1}\n'.format(RFC_random.score(s_X,s_y), RFC_random.score(X_test, y_test)))

# Hard-label predictions of the refitted best estimator.
y_pred_train_sr = RFC_random.predict(s_X)
y_pred_sr = RFC_random.predict(X_test)
# roc_auc_score takes (y_true, y_score). The original called
# sklearn.metrics.auc here, which integrates an arbitrary (x, y) curve and is
# not a ROC AUC when handed labels and predictions.
print('训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(s_y, y_pred_train_sr),roc_auc_score(y_test, y_pred_sr)))

# Confusion matrices with the positive class (1) listed first.
print('训练集混淆矩阵:\n{0},\n\n测试集混淆矩阵:\n{1}'.format(cm(s_y, y_pred_train_sr,labels=[1,0]),
                                                               cm(y_test, y_pred_sr,labels=[1,0])))
最优参数:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

训练集精确度:0.8192925526193122,测试集精确度:0.7796780347401184

训练集auc:0.8192925526193122,测试集auc:0.7544857947353516

训练集混淆矩阵:
[[94898 16658]
 [23660 87896]],

测试集混淆矩阵:
[[ 1489   564]
 [ 6019 21807]]
# Manual tuning: sweep n_estimators from 50 to 300 (step 50) on the original training split.
tree_counts = list(range(50, 350, 50))
auc_score = []
for i in tree_counts:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=i,max_depth=5,min_samples_split=200,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(tree_counts[auc_score.index(best_auc)], best_auc)
plt.plot(tree_counts, auc_score)
plt.show()
250 0.8640378842017733

在这里插入图片描述

# Manual tuning: the same n_estimators sweep (50..300, step 50), fitted on the SMOTE data.
tree_counts = list(range(50, 350, 50))
auc_score = []
for i in tree_counts:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=i,max_depth=5,min_samples_split=200,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(tree_counts[auc_score.index(best_auc)], best_auc)
plt.plot(tree_counts, auc_score)
plt.show()
250 0.8389906830396209

在这里插入图片描述

# Manual tuning: finer n_estimators sweep from 200 to 290 (step 10).
tree_counts = list(range(200, 300, 10))
auc_score = []
for i in tree_counts:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=i,max_depth=5,min_samples_split=200,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(tree_counts[auc_score.index(best_auc)], best_auc)
plt.plot(tree_counts, auc_score)
plt.show()
230 0.8640656490645973

在这里插入图片描述

# Manual tuning: sweep max_depth from 2 to 9 with n_estimators fixed at 230.
depth_grid = list(range(2,10,1))
auc_score = []
for i in depth_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=i,min_samples_split=200,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(depth_grid[auc_score.index(best_auc)], best_auc)
plt.plot(depth_grid, auc_score)
plt.show()
9 0.8677676427124693

在这里插入图片描述

# Manual tuning: extend the max_depth sweep to 9..14.
depth_grid = list(range(9,15,1))
auc_score = []
for i in depth_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=i,min_samples_split=200,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(depth_grid[auc_score.index(best_auc)], best_auc)
plt.plot(depth_grid, auc_score)
plt.show()
12 0.8684788028299709

在这里插入图片描述

# Manual tuning: sweep min_samples_split from 50 to 300 (step 50) at max_depth=12.
split_grid = list(range(50,350,50))
auc_score = []
for i in split_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=i,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(split_grid[auc_score.index(best_auc)], best_auc)
plt.plot(split_grid, auc_score)
plt.show()
50 0.8684788028299709

在这里插入图片描述

# Manual tuning: sweep min_samples_split over 10, 20, 30, 40, 50.
split_grid = list(range(10,55,10))
auc_score = []
for i in split_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=i,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(split_grid[auc_score.index(best_auc)], best_auc)
plt.plot(split_grid, auc_score)
plt.show()
10 0.8684788028299709

在这里插入图片描述

# Manual tuning: sweep min_samples_split from 2 to 10.
split_grid = list(range(2,11,1))
auc_score = []
for i in split_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=i,min_samples_leaf=100,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(split_grid[auc_score.index(best_auc)], best_auc)
plt.plot(split_grid, auc_score)
plt.show()
2 0.8684788028299709

在这里插入图片描述

# Manual tuning: sweep min_samples_leaf from 10 to 100 (step 10) at min_samples_split=2.
leaf_grid = list(range(10, 110, 10))
auc_score = []
for i in leaf_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=i,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(leaf_grid[auc_score.index(best_auc)], best_auc)
plt.plot(leaf_grid, auc_score)
plt.show()
40 0.8687270122873082

在这里插入图片描述

# Manual tuning: finer min_samples_leaf sweep over 36, 38, 40, 42, 44.
leaf_grid = list(range(36, 46, 2))
auc_score = []
for i in leaf_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=i,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(leaf_grid[auc_score.index(best_auc)], best_auc)
plt.plot(leaf_grid, auc_score)
plt.show()
36 0.8691479440821931

在这里插入图片描述

# Manual tuning: min_samples_leaf sweep widened to 30..48 (step 2).
leaf_grid = list(range(30, 50, 2))
auc_score = []
for i in leaf_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=i,
                                  max_features='sqrt',bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(leaf_grid[auc_score.index(best_auc)], best_auc)
plt.plot(leaf_grid, auc_score)
plt.show()
36 0.8691479440821931

在这里插入图片描述

# Manual tuning: sweep max_features as a fraction, i/10 for i = 1..10.
# The printed and plotted x value is i (tenths), not the fraction itself.
tenths_grid = list(range(1,11,1))
auc_score = []
for i in tenths_grid:
    # `bootstrap` is a boolean switch. The original passed 0.7, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=36,
                                  max_features=i/10,bootstrap=True,n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(tenths_grid[auc_score.index(best_auc)], best_auc)
plt.plot(tenths_grid, auc_score)
plt.show()
3 0.8691479440821931

在这里插入图片描述

# Manual tuning: "bootstrap" sweep over i = 1..10.
# NOTE(review): `bootstrap` is an on/off switch, so the integers 1..10 passed
# by the original were all just truthy and every fit in this loop is the same
# model. They are coerced to bool explicitly so that recent scikit-learn
# releases (which validate the parameter type) accept them. The bootstrap
# sample fraction is controlled by `max_samples`; sweep that instead if a real
# search was intended.
bootstrap_grid = list(range(1,11,1))
auc_score = []
for i in bootstrap_grid:
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=36,
                                  max_features=0.3,bootstrap=bool(i),n_jobs=-1,random_state=10,)
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(bootstrap_grid[auc_score.index(best_auc)], best_auc)
plt.plot(bootstrap_grid, auc_score)
plt.show()
1 0.8691479440821931

在这里插入图片描述

# Manual tuning: sweep the weight given to class 1 over 1, 3, ..., 13.
weight_grid = list(range(1,15,2))
auc_score = []
for i in weight_grid:
    # `bootstrap` is a boolean switch. The original passed 0.1, which only
    # acted as a truthy value and is rejected by the parameter validation of
    # recent scikit-learn releases; True keeps the same behaviour.
    RF_m = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=36,
                                  max_features=0.3,bootstrap=True,n_jobs=-1,random_state=10,class_weight={1:i})
    RF_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, RF_m.predict_proba(X_test)[:, 1]))
# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(weight_grid[auc_score.index(best_auc)], best_auc)
plt.plot(weight_grid, auc_score)
plt.show()
1 0.8691479440821931

在这里插入图片描述

# Final random forest with the hand-tuned settings, fitted on the original training split.
# `bootstrap` is a boolean switch. The original passed 0.1, which only acted as
# a truthy value and is rejected by the parameter validation of recent
# scikit-learn releases; True keeps the same behaviour.
best_est_RFC = RandomForestClassifier(n_estimators=230,max_depth=12,min_samples_split=2,min_samples_leaf=36,
                              max_features=0.3,bootstrap=True,n_jobs=-1,random_state=10,)
best_est_RFC.fit(X_train, y_train)
# Held-out AUC from positive-class probabilities.
print(roc_auc_score(y_test, best_est_RFC.predict_proba(X_test)[:, 1]))
0.8691479440821931
# Score the Kaggle test set with the tuned forest and write the submission,
# keeping the DataFrame index as the first CSV column.
df_test = test_data.drop(columns='SeriousDlqin2yrs')
positive_proba = best_est_RFC.predict_proba(df_test)[:, 1]
sample_data["Probability"] = positive_proba
print(sample_data.head())
sample_data.to_csv("submission_RFC.csv", index=True)
    Probability
Id             
1      0.079361
2      0.036505
3      0.016210
4      0.064601
5      0.126260

Gradient Boosting model

# Baseline model: default gradient boosting fitted on the original training split.
GB = GradientBoostingClassifier()
GB.fit(X_train, y_train)

train_acc = GB.score(X_train, y_train)
holdout_acc = GB.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(train_acc, holdout_acc))

# AUC computed from hard class labels.
train_label_auc = roc_auc_score(y_train, GB.predict(X_train))
holdout_label_auc = roc_auc_score(y_test, GB.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(train_label_auc, holdout_label_auc))

# AUC computed from positive-class probabilities.
train_proba_auc = roc_auc_score(y_train, GB.predict_proba(X_train)[:, 1])
holdout_proba_auc = roc_auc_score(y_test, GB.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(train_proba_auc, holdout_proba_auc))
训练集精确度:0.9389916666666667,测试集精确度:0.9377333333333333

predict 训练集auc:0.6017569654221069,测试集auc:0.5916827107760473

predict_proba 训练集auc:0.8685441072547121,测试集auc:0.8677365023729332
# Baseline 2: default gradient boosting fitted on the SMOTE-resampled training data.
GB_s = GradientBoostingClassifier()
GB_s.fit(s_X, s_y)

resampled_acc = GB_s.score(s_X, s_y)
holdout_acc = GB_s.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(resampled_acc, holdout_acc))

# AUC computed from hard class labels.
resampled_label_auc = roc_auc_score(s_y, GB_s.predict(s_X))
holdout_label_auc = roc_auc_score(y_test, GB_s.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(resampled_label_auc, holdout_label_auc))

# AUC computed from positive-class probabilities.
resampled_proba_auc = roc_auc_score(s_y, GB_s.predict_proba(s_X)[:, 1])
holdout_proba_auc = roc_auc_score(y_test, GB_s.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(resampled_proba_auc, holdout_proba_auc))
训练集精确度:0.8654704915690197,测试集精确度:0.8668

predict 训练集auc:0.8654704915690197,测试集auc:0.7248715725122952

predict_proba 训练集auc:0.9424107237355832,测试集auc:0.8360759633753794
# Manual tuning: sweep n_estimators from 50 to 300 (step 50) on the original training split.
stage_counts = list(range(50, 350, 50))
auc_score = []
for n_stages in stage_counts:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=n_stages, subsample=0.7,
        min_samples_split=200, min_samples_leaf=100, max_depth=3,
        random_state=10, max_features='sqrt',
    )
    GB_m.fit(X_train, y_train)
    positive_proba = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))

# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(stage_counts[auc_score.index(best_auc)], best_auc)
plt.plot(stage_counts, auc_score)
plt.show()
250 0.8684184382039928

在这里插入图片描述

# Manual tuning: finer n_estimators sweep from 200 to 290 (step 10).
stage_counts = list(range(200, 300, 10))
auc_score = []
for n_stages in stage_counts:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=n_stages, subsample=0.7,
        min_samples_split=200, min_samples_leaf=100, max_depth=3,
        random_state=10, max_features='sqrt',
    )
    GB_m.fit(X_train, y_train)
    positive_proba = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, positive_proba))

# Report the first setting that reaches the best held-out AUC.
best_auc = max(auc_score)
print(stage_counts[auc_score.index(best_auc)], best_auc)
plt.plot(stage_counts, auc_score)
plt.show()
250 0.8684184382039928

在这里插入图片描述

# Manual tuning, step 3: sweep max_depth over 2..9 with n_estimators fixed at 250.
# The depth giving the highest X_test AUC is printed together with that AUC.
depth_grid = range(2, 10)
depth_sweep_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                         min_samples_split=200, min_samples_leaf=100,
                         random_state=10, max_features='sqrt')

auc_score = []
for i in depth_grid:
    GB_m = GradientBoostingClassifier(max_depth=i, **depth_sweep_fixed)
    GB_m.fit(X_train, y_train)
    depth_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, depth_scores))

best_depth_auc = max(auc_score)
print(depth_grid[auc_score.index(best_depth_auc)], best_depth_auc)
plt.plot(depth_grid, auc_score)
plt.show()
4 0.8694069872976782

在这里插入图片描述

# Manual tuning, step 4: sweep max_features as a fraction, i/10 for i = 1..10
# (i.e. 0.1 .. 1.0), with max_depth fixed at 4.
# The printed "best" value is the integer i, not the fraction i/10.
tenths = range(1, 11)
feature_sweep_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                           min_samples_split=200, min_samples_leaf=100,
                           max_depth=4, random_state=10)

auc_score = []
for i in tenths:
    GB_m = GradientBoostingClassifier(max_features=i / 10, **feature_sweep_fixed)
    GB_m.fit(X_train, y_train)
    feature_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, feature_scores))

best_feature_auc = max(auc_score)
print(tenths[auc_score.index(best_feature_auc)], best_feature_auc)
plt.plot(tenths, auc_score)
plt.show()
3 0.8694069872976782

在这里插入图片描述

# Manual tuning, step 5: coarse sweep of min_samples_leaf (10, 60, ..., 260)
# with max_features fixed at 0.3. Candidates are ranked by AUC on X_test.
leaf_grid = range(10, 310, 50)
leaf_sweep_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                        min_samples_split=200, max_depth=4, random_state=10,
                        max_features=0.3)

auc_score = []
for i in leaf_grid:
    GB_m = GradientBoostingClassifier(min_samples_leaf=i, **leaf_sweep_fixed)
    GB_m.fit(X_train, y_train)
    leaf_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, leaf_scores))

best_leaf_auc = max(auc_score)
print(leaf_grid[auc_score.index(best_leaf_auc)], best_leaf_auc)
plt.plot(leaf_grid, auc_score)
plt.show()
110 0.8694922785209158

在这里插入图片描述

# Manual tuning, step 6: finer sweep of min_samples_leaf (70, 80, ..., 120).
# Same fixed settings as the coarse leaf sweep; ranked by AUC on X_test.
fine_leaf_grid = range(70, 130, 10)
fine_leaf_fixed = {
    'learning_rate': 0.1,
    'n_estimators': 250,
    'subsample': 0.7,
    'min_samples_split': 200,
    'max_depth': 4,
    'random_state': 10,
    'max_features': 0.3,
}

auc_score = []
for i in fine_leaf_grid:
    GB_m = GradientBoostingClassifier(min_samples_leaf=i, **fine_leaf_fixed)
    GB_m.fit(X_train, y_train)
    fine_leaf_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, fine_leaf_scores))

fine_leaf_best = max(auc_score)
fine_leaf_pos = auc_score.index(fine_leaf_best)  # first occurrence of the maximum
print(fine_leaf_grid[fine_leaf_pos], fine_leaf_best)
plt.plot(fine_leaf_grid, auc_score)
plt.show()
90 0.8697006000505265

在这里插入图片描述

# Manual tuning, step 7: coarse sweep of min_samples_split (10, 60, ..., 260)
# with min_samples_leaf fixed at 90. Candidates are ranked by AUC on X_test.
split_grid = range(10, 300, 50)
split_sweep_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                         min_samples_leaf=90, max_depth=4, random_state=10,
                         max_features=0.3)

auc_score = []
for i in split_grid:
    GB_m = GradientBoostingClassifier(min_samples_split=i, **split_sweep_fixed)
    GB_m.fit(X_train, y_train)
    split_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, split_scores))

best_split_auc = max(auc_score)
print(split_grid[auc_score.index(best_split_auc)], best_split_auc)
plt.plot(split_grid, auc_score)
plt.show()
260 0.869784512431818

在这里插入图片描述

# Manual tuning, step 8: extend the min_samples_split sweep upward (250, 300, 350, 400)
# to check whether larger values beat the coarse-sweep optimum. Ranked by AUC on X_test.
upper_split_grid = range(250, 410, 50)
upper_split_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                         min_samples_leaf=90, max_depth=4, random_state=10,
                         max_features=0.3)

auc_score = []
for i in upper_split_grid:
    GB_m = GradientBoostingClassifier(min_samples_split=i, **upper_split_fixed)
    GB_m.fit(X_train, y_train)
    upper_split_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, upper_split_scores))

upper_split_best = max(auc_score)
print(upper_split_grid[auc_score.index(upper_split_best)], upper_split_best)
plt.plot(upper_split_grid, auc_score)
plt.show()
250 0.8696629236420251

在这里插入图片描述

# Manual tuning, step 9: fine sweep of min_samples_split (250, 260, ..., 290).
# All other hyperparameters unchanged; ranked by AUC on X_test.
fine_split_grid = range(250, 300, 10)
fine_split_fixed = {
    'learning_rate': 0.1,
    'n_estimators': 250,
    'subsample': 0.7,
    'min_samples_leaf': 90,
    'max_depth': 4,
    'random_state': 10,
    'max_features': 0.3,
}

auc_score = []
for i in fine_split_grid:
    GB_m = GradientBoostingClassifier(min_samples_split=i, **fine_split_fixed)
    GB_m.fit(X_train, y_train)
    fine_split_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, fine_split_scores))

fine_split_best = max(auc_score)
fine_split_pos = auc_score.index(fine_split_best)  # first occurrence of the maximum
print(fine_split_grid[fine_split_pos], fine_split_best)
plt.plot(fine_split_grid, auc_score)
plt.show()
260 0.869784512431818

在这里插入图片描述

# Manual tuning, step 10: sweep subsample as a fraction, i/10 for i = 1..10
# (i.e. 0.1 .. 1.0), with min_samples_split fixed at 260.
# The printed "best" value is the integer i, not the fraction i/10.
subsample_tenths = range(1, 11)
subsample_sweep_fixed = dict(learning_rate=0.1, n_estimators=250,
                             min_samples_split=260, min_samples_leaf=90,
                             max_depth=4, random_state=10, max_features=0.3)

auc_score = []
for i in subsample_tenths:
    GB_m = GradientBoostingClassifier(subsample=i / 10, **subsample_sweep_fixed)
    GB_m.fit(X_train, y_train)
    subsample_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, subsample_scores))

best_subsample_auc = max(auc_score)
print(subsample_tenths[auc_score.index(best_subsample_auc)], best_subsample_auc)
plt.plot(subsample_tenths, auc_score)
plt.show()
7 0.869784512431818

在这里插入图片描述

# Manual tuning, step 11: refit the tuned configuration with several random seeds
# and print the seed giving the highest AUC on X_test.
# NOTE(review): picking random_state by test-set AUC selects on noise; the spread
# across seeds is better read as an estimate of run-to-run variance.
candidate_seeds = [10, 42, 100, 116214]
seed_sweep_fixed = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                        min_samples_split=260, min_samples_leaf=90,
                        max_depth=4, max_features=0.3)

auc_score = []
for i in candidate_seeds:
    GB_m = GradientBoostingClassifier(random_state=i, **seed_sweep_fixed)
    GB_m.fit(X_train, y_train)
    seed_scores = GB_m.predict_proba(X_test)[:, 1]
    auc_score.append(roc_auc_score(y_test, seed_scores))

best_seed_auc = max(auc_score)
print(candidate_seeds[auc_score.index(best_seed_auc)], best_seed_auc)
plt.plot(candidate_seeds, auc_score)
plt.show()
10 0.869784512431818

在这里插入图片描述

# Tuned model: gradient boosting with the hyperparameters chosen by the manual sweeps
# (learning_rate=0.1, n_estimators=250), fitted on X_train and evaluated on both splits.
tuned_kwargs = dict(learning_rate=0.1, n_estimators=250, subsample=0.7,
                    min_samples_split=260, min_samples_leaf=90, max_depth=4,
                    random_state=10, max_features=0.3)
GB_m = GradientBoostingClassifier(**tuned_kwargs)
GB_m.fit(X_train, y_train)

# Mean accuracy as returned by the estimator's .score().
tuned_train_acc = GB_m.score(X_train, y_train)
tuned_test_acc = GB_m.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(tuned_train_acc, tuned_test_acc))

# AUC computed from hard class labels (predict).
tuned_label_auc = (
    roc_auc_score(y_train, GB_m.predict(X_train)),
    roc_auc_score(y_test, GB_m.predict(X_test)),
)
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(*tuned_label_auc))

# AUC computed from the column-1 probability scores (predict_proba).
tuned_proba_auc = (
    roc_auc_score(y_train, GB_m.predict_proba(X_train)[:, 1]),
    roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]),
)
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(*tuned_proba_auc))
训练集精确度:0.9400916666666667,测试集精确度:0.9371666666666667

predict 训练集auc:0.6089339590765261,测试集auc:0.5925436990826797

predict_proba 训练集auc:0.874079427703056,测试集auc:0.869784512431818
# Variant of the tuned model: learning rate halved (0.05) and the number of trees
# doubled (500), all other hyperparameters unchanged. Fitted on X_train and
# evaluated on both splits with the same three reports.
slow_lr_kwargs = {
    'learning_rate': 0.05,
    'n_estimators': 500,
    'subsample': 0.7,
    'min_samples_split': 260,
    'min_samples_leaf': 90,
    'max_depth': 4,
    'random_state': 10,
    'max_features': 0.3,
}
GB_m = GradientBoostingClassifier(**slow_lr_kwargs)
GB_m.fit(X_train, y_train)

# Mean accuracy as returned by the estimator's .score().
slow_train_acc = GB_m.score(X_train, y_train)
slow_test_acc = GB_m.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(slow_train_acc, slow_test_acc))

# AUC computed from hard class labels (predict).
slow_train_label_auc = roc_auc_score(y_train, GB_m.predict(X_train))
slow_test_label_auc = roc_auc_score(y_test, GB_m.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(slow_train_label_auc, slow_test_label_auc))

# AUC computed from the column-1 probability scores (predict_proba).
slow_train_proba_auc = roc_auc_score(y_train, GB_m.predict_proba(X_train)[:, 1])
slow_test_proba_auc = roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(slow_train_proba_auc, slow_test_proba_auc))
训练集精确度:0.940275,测试集精确度:0.9373666666666667

predict 训练集auc:0.6067785694888352,测试集auc:0.5903218295893178

predict_proba 训练集auc:0.8745757049324849,测试集auc:0.8691238322551743
# Final estimator used for the submission: the learning_rate=0.1 / n_estimators=250
# configuration, fitted on X_train only.
best_gb_kwargs = dict(
    learning_rate=0.1,
    n_estimators=250,
    subsample=0.7,
    min_samples_split=260,
    min_samples_leaf=90,
    max_depth=4,
    random_state=10,
    max_features=0.3,
)
best_est_GB = GradientBoostingClassifier(**best_gb_kwargs)
# Kept as a bare trailing expression so a notebook cell still echoes the fitted estimator.
best_est_GB.fit(X_train, y_train)
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=4,
                           max_features=0.3, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=90, min_samples_split=260,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=10, subsample=0.7, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
# Build the submission file: drop the target column from the test frame, score it
# with best_est_GB, and store column 1 of predict_proba as "Probability".
# The scores are assigned positionally, so this assumes sample_data rows are in the
# same order as test_data rows -- TODO confirm.
df_test = test_data.drop(columns=['SeriousDlqin2yrs'])
submission_proba = best_est_GB.predict_proba(df_test)[:, 1]
sample_data["Probability"] = submission_proba
print(sample_data.head())
sample_data.to_csv("submission_GB.csv", index=True)
    Probability
Id             
1      0.072539
2      0.043227
3      0.013998
4      0.064475
5      0.083119
  • 0
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值