# Task -- model fusion
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import joblib
# Folder holding the tab-separated feature tables (stored with an .xls suffix).
path=r'C:\Users\Ilikeyou Isomeone\Desktop\E-learn\Datawhale\Risk_management\Rawdata'
# First row is the header, first column is the row index.
train=pd.read_table(path+'\\train_result.xls',header=0,index_col=0,sep='\t')
test=pd.read_table(path+'\\test_result.xls',header=0,index_col=0,sep='\t')
# 'isDefault' is the target; every other column is used as a feature.
Y_train=train['isDefault']
X_train=train.drop('isDefault',axis=1)
# Hold out 20% of the rows for validation. The split is seeded (same value as
# the LightGBM 'seed' parameter) so the hold-out set, and therefore the
# reported validation metrics, are reproducible between runs.
X_train_train,X_train_test,Y_train_train,Y_train_test=train_test_split(
    X_train,Y_train,test_size=0.2,random_state=2020)
# --- Random forest: honest hold-out evaluation -------------------------------
# Validate with a model fitted on the training split only. Fitting on all of
# X_train and then scoring on X_train_test (a subset of it) leaks the
# validation rows into training and inflates the reported AUC.
clf_RF_valid=RandomForestClassifier()
clf_RF_valid.fit(X_train_train,Y_train_train)
# AUC is a ranking metric: score with the probability of the positive class
# (second column of predict_proba, i.e. the larger label) instead of the hard
# labels returned by predict().
Y_train_test_pre=clf_RF_valid.predict_proba(X_train_test)[:,1]
fpr,tpr,threshold=metrics.roc_curve(Y_train_test,Y_train_test_pre)
# roc_auc_score takes (y_true, y_score); it must not be given the ROC curve
# coordinates (fpr, tpr).
roc_auc=metrics.roc_auc_score(Y_train_test,Y_train_test_pre)
print('验证集随机森林roc_auc: ',roc_auc)

# --- Random forest: final model ----------------------------------------------
# The persisted model is still trained on the full training table; this costs
# a second fit but keeps the saved artifact unchanged in meaning.
clf_RF=RandomForestClassifier()
clf_RF.fit(X_train,Y_train)
# NOTE(review): `path` has no trailing separator, so this writes
# 'Rawdataclf_RF.model' next to the Rawdata folder rather than inside it.
# Left as is because the prediction code loads from the same expression.
joblib.dump(clf_RF,path+'clf_RF.model')
# Hyper-parameters handed to lgb.train for the binary 'isDefault' target.
params = {
    # Task definition and evaluation metric.
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    # Tree shape.
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    # Regularisation is switched off (both penalties are zero).
    'reg_lambda': 0,
    'reg_alpha': 0,
    # No feature or row subsampling.
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    # Reproducibility, threading and logging.
    'seed': 2020,
    'nthread': 8,
    'silent': True,
    'verbose': -1,
}

# Wrap the hold-out split and the training split as LightGBM datasets.
valid_lgb = lgb.Dataset(X_train_test, label=Y_train_test)
train_lgb = lgb.Dataset(X_train_train, label=Y_train_train)
# Train with early stopping on the hold-out split: up to 20000 rounds, stop
# after 200 rounds without improvement, log every 1000 rounds.
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments are
# only accepted by older LightGBM releases; newer ones expect the equivalent
# callbacks (lgb.log_evaluation / lgb.early_stopping). Confirm against the
# installed version before upgrading.
model = lgb.train(params,train_set=train_lgb,valid_sets=valid_lgb,
                  num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
# With the 'binary' objective, predict() returns scores usable directly as
# y_score for ROC/AUC.
Y_train_test_pre=model.predict(X_train_test)
fpr,tpr,threshold=metrics.roc_curve(Y_train_test.values,Y_train_test_pre,pos_label=1.0)
# Validation AUC is computed from the true labels and the predicted scores.
# The earlier approach binarised fpr against the thresholds and compared that
# with tpr, which measures nothing about the model.
roc_auc=roc_auc_score(Y_train_test.values,Y_train_test_pre)
print("lightgbm's roc_auc: ", roc_auc)
# NOTE(review): `path` has no trailing separator, so the file is written as
# 'Rawdatalightgbm.model' beside the Rawdata folder; kept to stay consistent
# with how the random forest model path is built in this file.
joblib.dump(model,path+'lightgbm.model')
import pandas as pd
import numpy as np
import sklearn
import joblib
from sklearn.metrics import roc_auc_score
# Folder holding the tab-separated test table and the saved model.
path=r'C:\Users\Ilikeyou Isomeone\Desktop\E-learn\Datawhale\Risk_management\Rawdata'
test=pd.read_table(path+'\\test_result.xls',header=0,index_col=0,sep='\t')
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being wrapped in print() (which added a stray
# "None" line to the output).
test.info()
# Remove the target column so only feature columns are passed to the model.
test.drop('isDefault',axis=1,inplace=True)
# Add the 'purpose_13' column with a constant 0.
# NOTE(review): presumably a one-hot level absent from the test table --
# confirm against the feature-engineering step.
test['purpose_13'] =0
# Same path expression as used when the model was dumped (no separator between
# the folder name and the file name).
clf_RF=joblib.load(path+'clf_RF.model')
# 'purpose_13' was appended as the last column above, which need not be the
# position it had at fit time. When the estimator recorded its training
# column names, reorder the test columns to match; a missing column raises
# KeyError here instead of being mis-scored.
feature_order=getattr(clf_RF,'feature_names_in_',None)
if feature_order is not None:
    test=test[list(feature_order)]
# Hard class labels for every test row.
y_test=clf_RF.predict(test)
print(y_test)
print(len(y_test))