简单建模
# --- Baseline modelling setup --------------------------------------------
from sklearn.model_selection import KFold

# Feature columns fed to every model below (target column excluded).
predictors = [
    'loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade',
    'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome',
    'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti',
    'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc',
    'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
    'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
    'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
    'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDateDT',
]

# Design matrices and target; train_data / test_data are built earlier
# in the notebook (not visible in this chunk).
X_train = train_data[predictors]
X_test = test_data[predictors]
y_train = train_data["isDefault"]

# Reproducible 5-fold splitter, reused by the CV loop further down.
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
lightgbm进行建模
# --- Single hold-out LightGBM model --------------------------------------
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# 80/20 hold-out split. FIX: pass random_state=seed (seed is defined with
# the KFold splitter above) — without it the split, and therefore the
# reported AUC, changed on every run despite the script fixing seeds
# everywhere else.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=seed)

train_matrix = lgb.Dataset(X_train_split, y_train_split)
valid_matrix = lgb.Dataset(X_val, y_val)

# Near-default GBDT parameters; AUC on the validation set drives
# early stopping below.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',      # binary target: isDefault
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,            # unlimited depth
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,      # no column subsampling
    'bagging_fraction': 1,      # no row subsampling
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'silent': True,
    'verbose': -1,
}

# NOTE(review): verbose_eval / early_stopping_rounds as train() kwargs were
# removed in LightGBM 4.x (callbacks=[lgb.early_stopping(200),
# lgb.log_evaluation(1000)] there) — this call assumes LightGBM 3.x. Confirm
# the pinned version before upgrading.
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix,
                  num_boost_round=20000, verbose_eval=1000,
                  early_stopping_rounds=200)
一级标题
// A code block
var foo = 'bar';
roc_auc评分
# --- Hold-out evaluation: ROC curve and AUC ------------------------------
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# Score the validation rows at the best boosting round found by
# early stopping.
val_pre_lag = model.predict(X_val, num_iteration=model.best_iteration)

# ROC points for plotting; roc_auc_score gives the same trapezoidal area
# as metrics.auc(fpr, tpr).
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lag)
roc_auc = roc_auc_score(y_val, val_pre_lag)

# plt is presumably matplotlib.pyplot imported earlier in the notebook —
# not visible in this chunk.
plt.plot(fpr, tpr, "b", label="VAL AUC=%0.4f" % roc_auc)
plt.plot([0, 1], [0, 1], "r--")  # chance diagonal
plt.legend(loc="best")
plt.show()
使用5折交叉验证进行模型性能评估
# --- 5-fold cross-validated AUC ------------------------------------------
# BUG FIX: the original loop enumerated kf.split(...) but ignored
# train_index / valid_index, re-running a random 80/20 train_test_split on
# every iteration — five overlapping random splits, not cross-validation.
# The fold indices are used for the splits now, and the loop-invariant
# params dict is hoisted out of the loop.
cv_scores = []

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'silent': True,
    'verbose': -1,
}

for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    # KFold yields positional row indices -> use .iloc.
    X_train_split, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_matrix = lgb.Dataset(X_train_split, y_train_split)
    valid_matrix = lgb.Dataset(X_val, y_val)

    model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix,
                      num_boost_round=20000, verbose_eval=1000,
                      early_stopping_rounds=200)
    # Validation AUC for this fold, at the early-stopped best iteration.
    val_pre_lag = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pre_lag))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))