Now let's turn the legendary XGBoost loose on this dataset.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 19 13:19:26 2017
@author: luogan
"""
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('loans.csv')

# Fit one LabelEncoder per column, keyed by column name, so every
# categorical column gets its own integer encoding (and can be inverted later)
d = defaultdict(LabelEncoder)
dff = df.apply(lambda col: d[col.name].fit_transform(col))
dff.to_excel('dff.xls')
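Since every column's encoder is kept in d, the integer codes can be mapped back to the original labels later. A minimal sketch, reusing df, dff and d from above:

# Invert the per-column encoding with each column's own LabelEncoder
restored = dff.apply(lambda col: d[col.name].inverse_transform(col))
print(restored.head())  # round-trips back to the raw data (up to dtypes)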
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics                       # additional sklearn metrics
from sklearn.model_selection import GridSearchCV  # for grid search
import matplotlib.pylab as plt
#%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

train = pd.read_excel('dff.xls')
target = 'safe_loans'
IDcol = 'id'
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgboost's built-in CV to pick the best number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Optionally dump the predictions to Excel:
    # pd.DataFrame(dtrain_predictions).to_excel('dtrain_predictions.xls')
    # pd.DataFrame(dtrain_predprob).to_excel('dtrain_predprob.xls')

    print(alg)

    # Print model report (both scores are computed on the training set)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # Plot feature importances, labelled with the predictor names
    feat_imp = pd.Series(alg.feature_importances_, index=predictors).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
"""
model=alg
featureImportance = model.get_score()
features = pd.DataFrame()
features['features'] = featureImportance.keys()
features['importance'] = featureImportance.values()
features.sort_values(by=['importance'],ascending=False,inplace=True)
fig,ax= plt.subplots()
fig.set_size_inches(20,10)
plt.xticks(rotation=60)
#sn.barplot(data=features.head(30),x="features",y="importance",ax=ax,orient="v")
"""
# Choose all predictors except target & ID columns
predictors = [x for x in train.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=18,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

modelfit(xgb1, train, predictors)
Model Report
Accuracy : 0.9533
AUC Score (Train): 0.990971
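Keep in mind that both numbers in this report are computed on the training set, so they can be optimistic. A minimal held-out check, as a sketch (the 0.2 test fraction and the random_state are arbitrary choices, not part of the original script):

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation
X_tr, X_te, y_tr, y_te = train_test_split(
    train[predictors], train[target], test_size=0.2, random_state=27)

clf = XGBClassifier(**xgb1.get_params())  # same hyper-parameters as above
clf.fit(X_tr, y_tr)

print("Test Accuracy:", metrics.accuracy_score(y_te, clf.predict(X_te)))
print("Test AUC     :", metrics.roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))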
An accuracy of 95% leaves the decision tree and the BP network several streets behind!
The legendary XGBoost clearly lives up to its reputation; no wonder it is applied so widely in industrial practice.
The chart below shows the importance of each feature; a couple of output files are written along the way. To reproduce the results, run xgboost.py.
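As a possible next step: the script imports GridSearchCV but never calls it. A minimal tuning sketch might look like the following (the grid values and the smaller n_estimators are illustrative assumptions, not settings from this post):

# Search over tree depth and min_child_weight, scoring by cross-validated AUC
param_grid = {'max_depth': [6, 12, 18], 'min_child_weight': [1, 3, 5]}
gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100,
                            objective='binary:logistic', seed=27),
    param_grid=param_grid, scoring='roc_auc', cv=5)
gsearch.fit(train[predictors], train[target])
print(gsearch.best_params_, gsearch.best_score_)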