[一周算法进阶]--任务三-模型融合

Task3.模型融合

用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果。

1.导入相关包&读取数据

# --- 1. Imports & data loading ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# modern code should use sklearn.impute.SimpleImputer instead (same strategy API).
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder,Imputer

# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# skipinitialspace=True strips whitespace right after each delimiter while parsing.
data_original=pd.read_csv('data.csv',skipinitialspace=True)
# Work on a copy so the raw frame stays untouched for reference.
data=data_original.copy()
data.head(5)
Unnamed: 0custidtrade_nobank_card_nolow_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_day...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
05279185820180507115231274000000023057383卡号10.010.9900.900.550.313...2900.01688.01200.075.01.02.01200.01200.012.018.0
11053404720180507121002192000000023073000卡号10.020.9420001.281.000.458...3500.01758.015100.080.05.06.022800.09360.04.02.0
212284978720180507125159718000000023114911卡号10.040.9601.001.000.114...1600.01250.04200.087.01.01.04200.04200.02.06.0
313180970820180507121358683000000388283484卡号10.000.9620000.130.570.777...3200.01541.016300.080.05.05.030000.012180.02.04.0
414249982920180507115448545000000388205844卡号10.010.9900.461.000.175...2300.01630.08300.079.02.02.08400.08250.022.0120.0

5 rows × 90 columns

# Disable truncation so DataFrames display every row and column.
for _opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_opt, None)

2. 删除无关特征

# Drop identifier-like columns that carry no predictive signal.
data.drop(['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'source','id_name'], axis=1, inplace=True)
# Partition the frame into object-typed (categorical/text) and numeric parts.
object_cols = list(data.select_dtypes(include='object').columns)
data_obj = data[object_cols]
data_num = data.drop(columns=object_cols)

3.缺失值填充和热编码

# --- Missing-value imputation & one-hot encoding ---
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the drop-in replacement with the same strategy-based API.
try:
    from sklearn.impute import SimpleImputer as _MeanImputer  # sklearn >= 0.20
except ImportError:
    _MeanImputer = Imputer  # legacy sklearn, imported at the top of the file

# Fill numeric NaNs with each column's mean, then restore the DataFrame
# (fit_transform returns a bare ndarray and drops the column labels).
imputer = _MeanImputer(strategy='mean')
mean_num = imputer.fit_transform(data_num)
data_num = pd.DataFrame(mean_num, columns=data_num.columns)

# Forward-fill missing categorical values. Copy first: data_obj is a slice of
# `data`, and mutating a slice in place triggers SettingWithCopyWarning and may
# silently fail to write through.
data_obj = data_obj.copy()
data_obj.ffill(inplace=True)

# One-hot encode the single nominal feature with LabelBinarizer, then replace
# the raw column with its binary indicator columns.
encoder = LabelBinarizer()
reg_preference_1hot = encoder.fit_transform(data_obj[['reg_preference_for_trad']])
data_obj.drop(['reg_preference_for_trad'], axis=1, inplace=True)
reg_preference_df = pd.DataFrame(reg_preference_1hot, columns=encoder.classes_)
data_obj = pd.concat([data_obj, reg_preference_df], axis=1)

# ['latest_query_time']  ['loans_latest_time']
# Derive month / weekday features from the two date columns, then drop the
# raw timestamp columns.
for _date_col in ('latest_query_time', 'loans_latest_time'):
    _parsed = pd.to_datetime(data_obj[_date_col])
    data_obj[_date_col] = _parsed
    data_obj[_date_col + '_month'] = _parsed.dt.month
    data_obj[_date_col + '_weekday'] = _parsed.dt.weekday

data_obj = data_obj.drop(['latest_query_time', 'loans_latest_time'], axis=1)

# Reassemble the full feature table from the numeric and categorical halves.
data = pd.concat([data_num, data_obj], axis=1)
data.shape

(4754, 90)
# --- Dataset split ---
from sklearn.model_selection import train_test_split

# Target is `status`; every other column is a feature. Hold out 30% for
# testing with a fixed seed for reproducibility.
y = data['status']
X = data.drop(columns=['status'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018)

5.IV值与RF特征选择

# Features pre-selected via IV values and random-forest importance.
# Keep the list in ONE place so train and test can never drift apart
# (the original duplicated the 30-item list verbatim for each split).
selected_features = [
    'trans_amount_increase_rate_lately', 'trans_activity_day',
    'first_transaction_time', 'historical_trans_amount',
    'historical_trans_day', 'rank_trad_1_month', 'trans_amount_3_month',
    'top_trans_count_last_1_month', 'trans_top_time_last_1_month',
    'consume_top_time_last_1_month',
    'trans_fail_top_count_enum_last_1_month',
    'trans_fail_top_count_enum_last_6_month',
    'trans_fail_top_count_enum_last_12_month',
    'max_cumulative_consume_later_1_month', 'first_transaction_day',
    'trans_day_last_12_month', 'apply_score', 'loans_score', 'loans_count',
    'loans_settle_count', 'loans_overdue_count', 'latest_three_month_loan',
    'history_suc_fee', 'history_fail_fee', 'latest_one_month_suc',
    'latest_one_month_fail', 'consfin_credit_limit', 'consfin_avg_limit',
    'latest_query_day', 'loans_latest_day',
]
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_train.head(5)
trans_amount_increase_rate_latelytrans_activity_dayfirst_transaction_timehistorical_trans_amounthistorical_trans_dayrank_trad_1_monthtrans_amount_3_monthtop_trans_count_last_1_monthtrans_top_time_last_1_monthconsume_top_time_last_1_monthtrans_fail_top_count_enum_last_1_monthtrans_fail_top_count_enum_last_6_monthtrans_fail_top_count_enum_last_12_monthmax_cumulative_consume_later_1_monthfirst_transaction_daytrans_day_last_12_monthapply_scoreloans_scoreloans_countloans_settle_countloans_overdue_countlatest_three_month_loanhistory_suc_feehistory_fail_feelatest_one_month_suclatest_one_month_failconsfin_credit_limitconsfin_avg_limitlatest_query_dayloans_latest_day
1100.960.40520170217.0181770.0150.00.8515610.01.000.00.06.09.09.0220.0458.099.0535.0498.092.077.07.03.085.052.00.03.010600.08228.00.09.0
33940.870.20520170331.063350.074.00.6512200.00.4014.014.01.04.09.0470.0416.082.0540.0510.019.016.03.01.022.011.01.00.016300.07160.030.027.0
30521.980.20520141110.097190.093.00.4533280.00.3011.011.00.04.021.01950.01288.082.0516.0482.016.016.02.00.020.05.00.00.010400.010320.03.0137.0
4901.490.55520130817.0373700.0356.00.3061940.00.1015.015.08.08.08.03090.01738.082.0491.0448.040.022.07.03.040.078.00.010.06600.06418.020.051.0
11.280.45820160402.0302910.0224.00.3510590.00.0513.013.00.03.03.02100.0779.084.0653.0635.037.036.00.02.049.04.02.01.015100.09360.04.02.0

6.stacking模型融合

# --- Feature standardization ---
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training set ONLY, then apply the same
# transform to the test set (prevents information leakage).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
# Third-party stacking helper; requires `pip install mlxtend`.
from mlxtend.classifier import StackingClassifier

# Base learners (hyperparameters taken from earlier tuning tasks).
# solver='liblinear' is stated explicitly: it is the only built-in solver
# that supports the L1 penalty, and was the implicit default before
# scikit-learn 0.22 — without it, newer versions raise at fit time.
lr_model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
svm_model = svm.SVC(C=0.01, kernel='linear', probability=True)
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=50,
                                  min_samples_leaf=60, max_features=9,
                                  random_state=2333)
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=80, max_depth=3,
                          min_child_weight=5, gamma=0.2, subsample=0.8,
                          colsample_bytree=0.8, reg_alpha=1e-5,
                          objective='binary:logistic', nthread=4,
                          scale_pos_weight=1, seed=27)
# NOTE(review): `gamma` is an XGBoost parameter; LightGBM's sklearn wrapper
# forwards unknown kwargs to the booster, where it is likely ignored — verify.
lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3,
                            min_child_weight=11, gamma=0.1, subsample=0.5,
                            colsample_bytree=0.9, reg_alpha=1e-5,
                            nthread=4, scale_pos_weight=1, seed=27)
gbdt_model = GradientBoostingClassifier(n_estimators=100)
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100)

# Stacking ensemble: base learners feed class probabilities into a logistic
# regression meta-classifier.
# BUG FIX: the original referenced the undefined name `lgb_model`
# (NameError) — the LightGBM model above is bound to `lgbm_model`.
sclf_model = StackingClassifier(classifiers=[lgbm_model, gbdt_model, rf_model],
                                use_probas=True,
                                average_probas=False,
                                meta_classifier=lr_model)

models={'LR':lr_model, 'SVM':svm_model, 'DT':dt_model, 'GBDT':gbdt_model, 
        'XGBoost':xgb_model, 'LGBM':lgbm_model,'Stack':sclf_model}

---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

<ipython-input-13-bac9ce48d541> in <module>()
     12 from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
     13 from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
---> 14 from mlxtend.classifier import StackingClassifier
     15 
     16 lr_model = LogisticRegression(C = 0.1, penalty = 'l1')


ModuleNotFoundError: No module named 'mlxtend'
# Collect one row of metrics per model.
df_result=pd.DataFrame(columns=('model','accuracy','precision','recall','f1_score','auc'))
row = 0

def evaluate(y_pre, y):
    """Return (accuracy, precision, recall, f1) of predictions y_pre vs truth y."""
    return (accuracy_score(y, y_pre),
            precision_score(y, y_pre),
            recall_score(y, y_pre),
            f1_score(y, y_pre))

# Fit every model, score it on the held-out set, and record the metrics.
for name, model in models.items():
    print(name,'start training...')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    pos_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1 for AUC
    acc, p, r, f1 = evaluate(predictions, y_test)
    auc = roc_auc_score(y_test, pos_proba)
    df_result.loc[row] = [name, acc, p, r, f1, auc]
    row += 1
print(df_result)

ubuntu16不知怎么了,使用pip安装mlxtend始终报错,先把程序写上,明天继续安装。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值