Feature selection: XGBoost vs. RF parameter comparison
| XGBoost | Meaning | RF |
| --- | --- | --- |
| max_depth | maximum tree depth | max_depth |
| n_estimators | number of trees (boosting rounds for XGBoost) | n_estimators |
| learning_rate | learning rate | |
| subsample | fraction of training samples randomly drawn to build each tree | |
| colsample_bytree | fraction of feature columns randomly drawn for each tree | |
| scale_pos_weight | handles class imbalance; the negative-to-positive sample ratio | class_weight |
| reg_lambda | L2 regularization coefficient, default 1 | |
| gamma | coefficient on the penalty for the number of leaf nodes, i.e. the minimum loss reduction required to make a split | |
| | whether to evaluate the model on out-of-bag samples; default False | oob_score |
| | maximum number of features considered | max_features |
| | minimum number of samples required at a leaf node | min_samples_leaf |
| random_state | random seed | random_state |
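The two imbalance knobs map onto each other: XGBoost's scale_pos_weight is typically set to (negatives / positives), while RF's class_weight takes an explicit per-class dict. A minimal sketch of that mapping, assuming a binary label array y (the counts here are made-up):

import numpy as np

y = np.array([0] * 950 + [1] * 50)   # toy imbalanced labels (assumption)
neg, pos = int(np.sum(y == 0)), int(np.sum(y == 1))

xgb_kwargs = {'scale_pos_weight': neg / pos}        # XGBoost: upweight the positive class
rf_kwargs = {'class_weight': {0: 1, 1: neg / pos}}  # RF: the equivalent per-class dict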
XGBoost parameter tuning and feature selection
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
param_test = {
    # 'max_depth': range(5, 15, 1),
    # 'min_child_weight': range(1, 6, 2),
    # 'scale_pos_weight': [i for i in range(10, int(scale_pos_weight), 10)],
    # 'gamma': [i / 10.0 for i in range(0, 11, 1)],
    'reg_lambda': [i / 10.0 for i in range(10, 200, 20)]  # i.e. 1.0, 3.0, ..., 19.0
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=7,
        min_child_weight=1,
        gamma=0.7,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=20,
        random_state=random_state),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(X, Y)
print(gsearch.best_params_)
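# A hedged aside: GridSearchCV also exposes best_score_ and cv_results_
# (standard sklearn attributes), so the winning entry of each pass can be
# folded back into the fixed kwargs before tuning the next parameter group.
print(gsearch.best_score_)                # mean CV roc_auc of the best entry
base_params = dict(learning_rate=0.1, n_estimators=140, max_depth=7)
base_params.update(gsearch.best_params_)  # merge the tuned values over the fixed ones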
# Retrain with the best hyperparameter combination found by the search
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=7,
    min_child_weight=1,
    gamma=0.7,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=20,
    random_state=random_state)
model.fit(X, Y)  # the model must be fitted before feature_importances_ is available
# Print the feature importances
print(model.feature_importances_)
# Plot feature importance using XGBoost's built-in function
import pandas as pd
from matplotlib import pyplot
from xgboost import plot_importance
plot_importance(model)
pyplot.show()
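# A brief aside, not in the original pipeline: plot_importance defaults to
# importance_type='weight' (number of splits per feature); 'gain' and 'cover'
# are the other built-in views and often rank features differently.
plot_importance(model, importance_type='gain', max_num_features=20)
pyplot.show()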
# Sort feature_importances_ (assumes column 0 of train_set is the label)
pyplot.bar(train_set.columns[1:], model.feature_importances_)
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
print(imp_score['score'][:6])  # top-6 scores
# Plot the top-N features
import matplotlib.pyplot as plt
import seaborn as sns
sns_colors = sns.color_palette('colorblind')  # palette for the bar chart
keys = list(imp_score.index)
feature_length = len([k for k in train_set if k not in excluded_features])
not_used_features = [k for k in train_set if k not in excluded_features and k not in keys]
N = 80
ax2 = (imp_score.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],
             title='Gain feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()
# Select features via feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit one model per importance value, using it as the selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # train a fresh model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)
    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    # print('fpr:{0}_tpr:{1}'.format(fpr, tpr))
    cm = metrics.confusion_matrix(y_test, predictions)
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    # tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)  # overwrite the curve output with the scalar false-positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # if accuracy_max > accuracy:
    #     continue
    # else:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], auc, accuracy * 100.0))
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparable performance with as few features as possible
# (see the sketch below for one way to automate this choice).
N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
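One way to automate that choice, as a hedged sketch: assume the loop above is extended to append (thresh, n, auc) tuples to a list named results, then keep the smallest feature count whose AUC stays within a tolerance of the best.

# results = [(thresh, n, auc), ...] collected inside the loop above (assumption)
tolerance = 0.005
best_auc = max(r[2] for r in results)
# among entries close to the best AUC, keep the one with the fewest features
thresh_sel, n_sel, auc_sel = min(
    (r for r in results if best_auc - r[2] <= tolerance),
    key=lambda r: r[1])
print('keep n=%d features (thresh=%.3f, AUC=%.3f)' % (n_sel, thresh_sel, auc_sel))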
RandomForestClassifier parameter tuning and feature selection
# Parameter tuning (the XGBoost-only options from the grid above, such as
# min_child_weight, gamma and reg_lambda, do not apply to RandomForestClassifier)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_test = {
    # 'max_depth': range(5, 15, 1),
    # 'n_estimators': range(150, 300, 10),
    'class_weight': [{0: 30, 1: 1}, {0: 40, 1: 1}]  # weights for the imbalanced classes
}
gsearch = GridSearchCV(
    estimator=RandomForestClassifier(
        # max_depth=7,           # maximum tree depth
        # n_estimators=300,      # number of trees
        # oob_score=True,        # evaluate on out-of-bag samples; default False
        # max_features=50,       # max features considered when looking for a split
        # min_samples_split=20,  # do not split a node holding fewer samples than this
        # min_samples_leaf=20,   # minimum number of samples required at a leaf
        # class_weight={0: 30, 1: 1},  # weights for the imbalanced classes
        # random_state=7
    ),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(train_set[[k for k in train_set if k not in excluded_features]], train_set["label"])
print(gsearch.best_params_)
# Retrain a model with the best hyperparameter combination found by the search
model = RandomForestClassifier(
    # max_depth=7,           # maximum tree depth
    # n_estimators=300,      # number of trees
    # oob_score=True,        # evaluate on out-of-bag samples; default False
    # max_features=50,       # max features considered when looking for a split
    # min_samples_split=20,  # do not split a node holding fewer samples than this
    # min_samples_leaf=20,   # minimum number of samples required at a leaf
    # class_weight={0: 30, 1: 1},  # weights for the imbalanced classes
    random_state=random_state)
model.fit(X,Y)
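# A hedged aside: with oob_score=True, a fitted RandomForestClassifier exposes
# oob_score_, an out-of-bag accuracy estimate that serves as a validation-style
# score without a held-out set. model_oob below is illustrative only.
model_oob = RandomForestClassifier(n_estimators=300, oob_score=True, random_state=random_state)
model_oob.fit(X, Y)
print(model_oob.oob_score_)  # mean accuracy on the out-of-bag samples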
# Print the feature importances
print(model.feature_importances_)
# Plot the importances (assumes column 0 of train_set is the label)
from matplotlib import pyplot
pyplot.bar(train_set.columns[1:], model.feature_importances_)
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
print(imp_score['score'][:6])  # top-6 scores
# Build a sorted score table
keys = list(imp_score.index)
values = list(imp_score['score'].values)
imp_scores = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
# Plot the top-N features
N = 80
ax2 = (imp_scores.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],
             title='RF feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()
# Select features via feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit one model per importance value, using it as the selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # train a fresh model on the reduced feature set
    selection_model = RandomForestClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)
    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    # print('fpr:{0}_tpr:{1}'.format(fpr, tpr))
    cm = metrics.confusion_matrix(y_test, predictions)
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    # tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)  # overwrite the curve output with the scalar false-positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # if accuracy_max > accuracy:
    #     continue
    # else:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], auc, accuracy * 100.0))
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparable performance with as few features as possible.
N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
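Once feature_columns is fixed, a minimal sketch of the final refit on just those columns (reusing train_set, test_set and the 'label' column from above):

final_model = RandomForestClassifier(random_state=random_state)
final_model.fit(train_set[feature_columns], train_set['label'])
proba = final_model.predict_proba(test_set[feature_columns])[:, 1]
print(metrics.roc_auc_score(test_set['label'], proba))  # AUC with the reduced feature set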