def modelfit(model, X_train, X_test, y_train, y_test, useTrainCV, eval_metric='auc', cv_folds=4,
             early_stopping_rounds=20):
    """Fit ``model`` on the training split, optionally sizing it by cross-validation first.

    When ``useTrainCV`` is truthy, ``xgb.cv`` is run on the training data with
    the model's own booster parameters, and the number of rows in the returned
    history is written back to the model as ``n_estimators``. The model is then
    fitted on the training split and the train/test report produced by
    ``myscoring`` (threshold fixed at 0.5) is printed.

    Args:
        model: Estimator exposing ``get_xgb_params``, ``get_params``,
            ``set_params`` and ``fit`` (presumably an xgboost sklearn-style
            classifier -- confirm against callers).
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels, used only for the report.
        useTrainCV: Whether to run the cross-validation step before fitting.
        eval_metric: Metric handed to both ``xgb.cv`` and ``model.fit``.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Passed to ``xgb.cv``; per the original note,
            boosting stops when there is no improvement within this many rounds.

    Returns:
        The same ``model`` object, after fitting.
    """
    print('Model eval_metric is % s' % eval_metric)

    if useTrainCV:
        booster_params = model.get_xgb_params()
        dtrain = xgb.DMatrix(X_train, label=y_train)
        cv_history = xgb.cv(
            booster_params,
            dtrain,
            num_boost_round=model.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
        )
        # The length of the CV history is used as the number of boosting rounds.
        best_rounds = cv_history.shape[0]
        print("Best Iteration: %d" % best_rounds)
        model.set_params(n_estimators=best_rounds)

    # Fit on the training data (with the CV-derived n_estimators, if any).
    model.fit(X_train, y_train, eval_metric=eval_metric)

    report = myscoring(model, X_train, y_train, X_test, y_test, threshold=0.5)
    print(report)
    return model
def myscoring(model, X_train, y_train, X_test, y_test, threshold=0.5):
    """Build a train-vs-test metric table for a fitted probabilistic classifier.

    Column 1 of ``model.predict_proba`` is taken as the score for each split
    (presumably the positive-class probability -- confirm for the estimator in
    use). Scores at or above ``threshold`` become label 1, all others 0.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_train, y_train: Training features and true labels.
        X_test, y_test: Hold-out features and true labels.
        threshold: Cut-off applied to the scores to obtain hard labels.

    Returns:
        A DataFrame with columns ``'evaluating indicator'``, ``'Train'`` and
        ``'Test'`` and one row each for accuracy, ROC AUC, recall, F1 and
        precision. ROC AUC is computed from the raw scores; the other metrics
        from the thresholded labels.
    """
    print("threshold %s"%threshold)

    def _metric_column(labels, scores):
        # Threshold the scores into 0/1 labels, then evaluate every metric
        # in the same order as the indicator names below.
        hard_labels = (scores >= threshold) * 1
        return [
            metrics.accuracy_score(labels, hard_labels),
            metrics.roc_auc_score(labels, scores),
            metrics.recall_score(labels, hard_labels),
            metrics.f1_score(labels, hard_labels),
            metrics.precision_score(labels, hard_labels),
        ]

    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]

    # The indicator labels are runtime data and are kept exactly as they were
    # (including the spelling of the last one) so downstream lookups still match.
    indicator_names = ['Accuracy', 'AUC Score', 'Recall', 'F1-score', 'Precesion']
    report = pd.DataFrame(
        data={
            'evaluating indicator': indicator_names,
            'Train': _metric_column(y_train, train_scores),
            'Test': _metric_column(y_test, test_scores),
        }
    )
    return report[['evaluating indicator', 'Train', 'Test']]
## Feature-importance output
def features_importance(xgb, features):
    """Return the features with positive importance, most important first.

    Args:
        xgb: Fitted model exposing ``feature_importances_``. The parameter name
            hides any module-level ``xgb`` alias inside this function; it is
            kept unchanged for caller compatibility.
        features: Iterable of feature names, paired positionally with
            ``xgb.feature_importances_`` (pairing stops at the shorter one).

    Returns:
        A DataFrame with columns ``feature`` and ``importance``, sorted by
        importance in descending order and restricted to rows whose importance
        is greater than 0. When there are no pairs at all, an empty frame with
        those two columns is returned.
    """
    records = [
        {"feature": name, "importance": score}
        for name, score in zip(features, xgb.feature_importances_)
    ]
    records.sort(key=lambda record: record["importance"], reverse=True)

    # Naming the columns explicitly keeps the "importance" column present even
    # when `records` is empty; without it the filter below raised KeyError.
    ranked = pd.DataFrame(records, columns=["feature", "importance"])
    return ranked[ranked["importance"] > 0]
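## Given the base parameters, use modelfit to find a suitable n_estimators value,
## then tune the remaining parameters with a grid search or Bayesian optimisation.
## Define a simple grid search.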
from sklearn.model_selection import GridSearchCV
def Gcv(X_train, X_test, y_train, y_test, model=None, params=None):
    """Grid-search hyperparameters on the training split and return the best combination.

    The search uses ``GridSearchCV`` with its default cross-validation and
    scoring settings (none are overridden here). The best score and the best
    parameter combination are printed before being returned.

    Args:
        X_train, y_train: Training features and labels used for the search.
        X_test, y_test: Accepted for signature compatibility; not used here.
        model: Estimator to tune. When ``None``, an
            ``XGBClassifier(scale_pos_weight=452/3573, objective='binary:logistic')``
            is created per call (the ratio is presumably the class balance of
            the original author's dataset -- confirm before reusing it).
        params: Parameter grid mapping estimator parameter names to candidate
            values. When ``None``, a default grid over ``learning_rate``,
            ``n_estimators``, ``max_depth``, ``subsample`` and
            ``colsample_bytree`` is used.

    Returns:
        ``best_params_`` of the fitted search (a dict of the winning values).
    """
    # Defaults are built per call instead of once at definition time, so no
    # estimator instance or dict is shared between calls.
    if model is None:
        model = XGBClassifier(scale_pos_weight=452/3573, objective='binary:logistic')
    if params is None:
        params = {
            # The key previously carried a trailing space ('learning_rate '),
            # so it did not name the estimator's learning_rate parameter.
            'learning_rate': [0.05, 0.1, 0.3],
            'n_estimators': [50, 60, 70],
            'max_depth': [3, 4, 5, 6],
            'subsample': [0.6, 0.75],
            'colsample_bytree': [0.5, 0.6, 0.7],
        }

    search = GridSearchCV(model, param_grid=params)
    search.fit(X_train, y_train)
    print(search.best_score_)
    print(search.best_params_)
    return search.best_params_
def model_paramadjust_train(X_train, y_train, X_test, y_test, class_weight=1, thred=0.5):
    """Tune an XGBClassifier by grid search, train it, and score it on both splits.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels. They are used both as the
            ``eval_set`` during fitting and for the returned report.
        class_weight: Value passed as ``scale_pos_weight`` to the classifier.
        thred: Probability threshold forwarded to ``myscoring``.

    Returns:
        A tuple ``(model, report)``: the fitted classifier and the DataFrame
        returned by ``myscoring``.
    """
    objective = 'binary:logistic'

    # Tune on an estimator configured like the one trained below. Previously
    # the search ran on Gcv's default estimator, whose scale_pos_weight is a
    # fixed constant unrelated to `class_weight`.
    params = Gcv(
        X_train, X_test, y_train, y_test,
        model=XGBClassifier(scale_pos_weight=class_weight, objective=objective),
    )

    model = XGBClassifier(scale_pos_weight=class_weight, objective=objective)
    model.set_params(**params)

    # NOTE(review): the hold-out split doubles as the early-stopping validation
    # set and as the data scored below, so the reported test metrics are likely
    # optimistic -- consider a separate validation split.
    # NOTE(review): passing early_stopping_rounds / eval_metric to fit() is
    # presumably deprecated in newer xgboost releases -- confirm against the
    # installed version before upgrading.
    model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)])
    return model, myscoring(model, X_train, y_train, X_test, y_test, threshold=thred)
##训练集测试集指标计算
##特征重要性绘图
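# NOTE: the two lines below are web-page residue from the source article, kept as comments so the file parses.
# python xgboost 调参  (python xgboost parameter tuning)
# 最新推荐文章于 2024-05-05 00:06:07 发布  (latest recommended article, published 2024-05-05 00:06:07)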